From f63d33da0a517a9a7096e0e67defb50c5995dd41 Mon Sep 17 00:00:00 2001
From: Naveen Seth Hanig <naveen.hanig@outlook.com>
Date: Mon, 10 Nov 2025 21:24:39 +0100
Subject: [PATCH 01/97] Reland "[clang] Refactor option-related code from
 clangDriver into new clangOptions library" (#167374)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This relands #167348.

The original PR was reverted due to a reported build failure, which was
later diagnosed as a local issue in the developer’s checkout or build
state. See discussion here:
https://github.com/llvm/llvm-project/pull/163659#discussion_r2511546964

No additional changes have been made in this reland.
---
 clang-tools-extra/clangd/CMakeLists.txt       |   1 +
 clang-tools-extra/clangd/CompileCommands.cpp  |  27 +-
 clang-tools-extra/modularize/CMakeLists.txt   |   1 +
 .../modularize/CoverageChecker.cpp            |   6 +-
 clang-tools-extra/modularize/Modularize.cpp   |   4 +-
 .../modularize/ModularizeUtilities.cpp        |   6 +-
 clang-tools-extra/pp-trace/CMakeLists.txt     |   1 +
 clang-tools-extra/pp-trace/PPTrace.cpp        |   2 +-
 clang/docs/CMakeLists.txt                     |   2 +-
 clang/docs/InternalsManual.rst                |   8 +-
 clang/docs/ReleaseNotes.rst                   |   3 +
 clang/include/clang/CMakeLists.txt            |   2 +-
 clang/include/clang/Driver/Driver.h           |   2 +-
 clang/include/clang/Frontend/Utils.h          |   2 +-
 .../clang/{Driver => Options}/CMakeLists.txt  |   0
 .../{Driver => Options}/ClangOptionDocs.td    |   0
 .../clang/{Driver => Options}/OptionUtils.h   |   6 +-
 .../clang/{Driver => Options}/Options.h       |  20 +-
 .../clang/{Driver => Options}/Options.td      |   0
 clang/include/module.modulemap                |   1 +
 clang/lib/CMakeLists.txt                      |   1 +
 clang/lib/Driver/CMakeLists.txt               |   3 +-
 clang/lib/Driver/Compilation.cpp              |   2 +-
 clang/lib/Driver/Driver.cpp                   |   2 +-
 clang/lib/Driver/SanitizerArgs.cpp            |   2 +-
 clang/lib/Driver/ToolChain.cpp                |   6 +-
 clang/lib/Driver/ToolChains/AIX.cpp           |   6 +-
 clang/lib/Driver/ToolChains/AMDGPU.cpp        |  21 +-
 clang/lib/Driver/ToolChains/AMDGPU.h          |   2 +-
 clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp  |   2 +-
 clang/lib/Driver/ToolChains/AVR.cpp           |   2 +-
 clang/lib/Driver/ToolChains/Arch/AArch64.cpp  |   4 +-
 clang/lib/Driver/ToolChains/Arch/ARM.cpp      |   4 +-
 clang/lib/Driver/ToolChains/Arch/CSKY.cpp     |   6 +-
 .../lib/Driver/ToolChains/Arch/LoongArch.cpp  |   5 +-
 clang/lib/Driver/ToolChains/Arch/M68k.cpp     |  16 +-
 clang/lib/Driver/ToolChains/Arch/Mips.cpp     |   5 +-
 clang/lib/Driver/ToolChains/Arch/PPC.cpp      |   2 +-
 clang/lib/Driver/ToolChains/Arch/RISCV.cpp    |   2 +-
 clang/lib/Driver/ToolChains/Arch/Sparc.cpp    |   4 +-
 clang/lib/Driver/ToolChains/Arch/SystemZ.cpp  |  10 +-
 clang/lib/Driver/ToolChains/Arch/VE.cpp       |   2 +-
 clang/lib/Driver/ToolChains/Arch/X86.cpp      |  18 +-
 clang/lib/Driver/ToolChains/BareMetal.cpp     |   4 +-
 clang/lib/Driver/ToolChains/CSKYToolChain.cpp |   2 +-
 clang/lib/Driver/ToolChains/Clang.cpp         |  13 +-
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |  24 +-
 clang/lib/Driver/ToolChains/CrossWindows.cpp  |   2 +-
 clang/lib/Driver/ToolChains/Cuda.cpp          |   8 +-
 clang/lib/Driver/ToolChains/Cygwin.cpp        |   4 +-
 clang/lib/Driver/ToolChains/Darwin.cpp        |   6 +-
 clang/lib/Driver/ToolChains/DragonFly.cpp     |   4 +-
 clang/lib/Driver/ToolChains/Flang.cpp         |   4 +-
 clang/lib/Driver/ToolChains/FreeBSD.cpp       |   4 +-
 clang/lib/Driver/ToolChains/Fuchsia.cpp       |   4 +-
 clang/lib/Driver/ToolChains/Gnu.cpp           |  10 +-
 clang/lib/Driver/ToolChains/HIPAMD.cpp        |   2 +-
 clang/lib/Driver/ToolChains/HIPSPV.cpp        |   4 +-
 clang/lib/Driver/ToolChains/HIPUtility.cpp    |   2 +-
 clang/lib/Driver/ToolChains/Hexagon.cpp       |   2 +-
 clang/lib/Driver/ToolChains/Hurd.cpp          |   4 +-
 clang/lib/Driver/ToolChains/Linux.cpp         |   4 +-
 clang/lib/Driver/ToolChains/MSP430.cpp        |   2 +-
 clang/lib/Driver/ToolChains/MSVC.cpp          |   2 +-
 clang/lib/Driver/ToolChains/Managarm.cpp      |   4 +-
 clang/lib/Driver/ToolChains/MinGW.cpp         |   2 +-
 clang/lib/Driver/ToolChains/MipsLinux.cpp     |   4 +-
 clang/lib/Driver/ToolChains/NetBSD.cpp        |   4 +-
 clang/lib/Driver/ToolChains/OHOS.cpp          |   4 +-
 clang/lib/Driver/ToolChains/OpenBSD.cpp       |   4 +-
 clang/lib/Driver/ToolChains/PPCFreeBSD.cpp    |   4 +-
 clang/lib/Driver/ToolChains/PPCLinux.cpp      |   4 +-
 clang/lib/Driver/ToolChains/PS4CPU.cpp        |   2 +-
 clang/lib/Driver/ToolChains/SPIRV.cpp         |   2 +-
 clang/lib/Driver/ToolChains/SYCL.cpp          |   2 +-
 clang/lib/Driver/ToolChains/Solaris.cpp       |   4 +-
 clang/lib/Driver/ToolChains/UEFI.cpp          |   2 +-
 clang/lib/Driver/ToolChains/VEToolchain.cpp   |   6 +-
 clang/lib/Driver/ToolChains/WebAssembly.cpp   |   6 +-
 clang/lib/Driver/ToolChains/XCore.cpp         |   6 +-
 clang/lib/Driver/ToolChains/ZOS.cpp           |   2 +-
 clang/lib/Driver/XRayArgs.cpp                 |   2 +-
 clang/lib/Frontend/CMakeLists.txt             |   1 +
 clang/lib/Frontend/CompilerInvocation.cpp     |  54 +-
 .../CreateInvocationFromCommandLine.cpp       |   6 +-
 clang/lib/FrontendTool/CMakeLists.txt         |   1 +
 .../ExecuteCompilerInvocation.cpp             |   6 +-
 clang/lib/Interpreter/Interpreter.cpp         |   4 +-
 clang/lib/Interpreter/InterpreterUtils.h      |   2 +-
 clang/lib/Options/CMakeLists.txt              |  18 +
 .../lib/{Driver => Options}/DriverOptions.cpp |  19 +-
 clang/lib/{Driver => Options}/OptionUtils.cpp |   2 +-
 clang/lib/Tooling/CMakeLists.txt              |   1 +
 .../InterpolatingCompilationDatabase.cpp      |  12 +-
 clang/lib/Tooling/Tooling.cpp                 |  12 +-
 clang/tools/clang-check/CMakeLists.txt        |   1 +
 clang/tools/clang-check/ClangCheck.cpp        |   4 +-
 clang/tools/clang-installapi/CMakeLists.txt   |   1 +
 .../clang-installapi/ClangInstallAPI.cpp      |   4 +-
 clang/tools/clang-installapi/Options.cpp      |  55 +-
 clang/tools/driver/CMakeLists.txt             |   1 +
 clang/tools/driver/cc1_main.cpp               |   2 +-
 clang/tools/driver/cc1as_main.cpp             |   8 +-
 clang/tools/driver/driver.cpp                 |   2 +-
 clang/unittests/Driver/DXCModeTest.cpp        |   4 +-
 clang/www/OpenProjects.html                   |   2 +-
 flang/docs/CMakeLists.txt                     |   2 +-
 flang/docs/FlangDriver.md                     |   8 +-
 flang/lib/Frontend/CMakeLists.txt             |   1 +
 flang/lib/Frontend/CompilerInvocation.cpp     | 547 ++++++++----------
 flang/lib/FrontendTool/CMakeLists.txt         |   1 +
 .../ExecuteCompilerInvocation.cpp             |   6 +-
 flang/tools/flang-driver/CMakeLists.txt       |   1 +
 flang/tools/flang-driver/driver.cpp           |   4 +-
 .../Platform/MacOSX/PlatformDarwin.cpp        |   4 +-
 llvm/include/llvm/MC/MCAsmInfo.h              |   2 +-
 llvm/include/llvm/Option/Arg.h                |   2 +-
 .../llvm-project-overlay/clang/BUILD.bazel    |   4 +-
 118 files changed, 590 insertions(+), 610 deletions(-)
 rename clang/include/clang/{Driver => Options}/CMakeLists.txt (100%)
 rename clang/include/clang/{Driver => Options}/ClangOptionDocs.td (100%)
 rename clang/include/clang/{Driver => Options}/OptionUtils.h (94%)
 rename clang/include/clang/{Driver => Options}/Options.h (83%)
 rename clang/include/clang/{Driver => Options}/Options.td (100%)
 create mode 100644 clang/lib/Options/CMakeLists.txt
 rename clang/lib/{Driver => Options}/DriverOptions.cpp (76%)
 rename clang/lib/{Driver => Options}/OptionUtils.cpp (97%)

diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt
index fb3f05329be21..d7ec853af862f 100644
--- a/clang-tools-extra/clangd/CMakeLists.txt
+++ b/clang-tools-extra/clangd/CMakeLists.txt
@@ -165,6 +165,7 @@ clang_target_link_libraries(clangDaemon
   clangBasic
   clangDependencyScanning
   clangDriver
+  clangOptions
   clangFormat
   clangFrontend
   clangIndex
diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp
index c1be93730129a..7990f2719e9a0 100644
--- a/clang-tools-extra/clangd/CompileCommands.cpp
+++ b/clang-tools-extra/clangd/CompileCommands.cpp
@@ -11,8 +11,8 @@
 #include "support/Logger.h"
 #include "support/Trace.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInvocation.h"
+#include "clang/Options/Options.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -206,7 +206,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command,
   if (Cmd.empty())
     return;
 
-  auto &OptTable = clang::driver::getDriverOptTable();
+  auto &OptTable = getDriverOptTable();
   // OriginalArgs needs to outlive ArgList.
   llvm::SmallVector<const char *, 16> OriginalArgs;
   OriginalArgs.reserve(Cmd.size());
@@ -222,8 +222,8 @@ void CommandMangler::operator()(tooling::CompileCommand &Command,
   llvm::opt::InputArgList ArgList;
   ArgList = OptTable.ParseArgs(
       llvm::ArrayRef(OriginalArgs).drop_front(), IgnoredCount, IgnoredCount,
-      llvm::opt::Visibility(IsCLMode ? driver::options::CLOption
-                                     : driver::options::ClangOption));
+      llvm::opt::Visibility(IsCLMode ? options::CLOption
+                                     : options::ClangOption));
 
   llvm::SmallVector<unsigned, 1> IndicesToDrop;
   // Having multiple architecture options (e.g. when building fat binaries)
@@ -232,7 +232,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command,
   // As there are no signals to figure out which one user actually wants. They
   // can explicitly specify one through `CompileFlags.Add` if need be.
   unsigned ArchOptCount = 0;
-  for (auto *Input : ArgList.filtered(driver::options::OPT_arch)) {
+  for (auto *Input : ArgList.filtered(options::OPT_arch)) {
     ++ArchOptCount;
     for (auto I = 0U; I <= Input->getNumValues(); ++I)
       IndicesToDrop.push_back(Input->getIndex() + I);
@@ -262,13 +262,12 @@ void CommandMangler::operator()(tooling::CompileCommand &Command,
   // explicitly at the end of the flags. This ensures modifications done in the
   // following steps apply in more cases (like setting -x, which only affects
   // inputs that come after it).
-  for (auto *Input : ArgList.filtered(driver::options::OPT_INPUT)) {
+  for (auto *Input : ArgList.filtered(options::OPT_INPUT)) {
     SawInput(Input->getValue(0));
     IndicesToDrop.push_back(Input->getIndex());
   }
   // Anything after `--` is also treated as input, drop them as well.
-  if (auto *DashDash =
-          ArgList.getLastArgNoClaim(driver::options::OPT__DASH_DASH)) {
+  if (auto *DashDash = ArgList.getLastArgNoClaim(options::OPT__DASH_DASH)) {
     auto DashDashIndex = DashDash->getIndex() + 1; // +1 accounts for Cmd[0]
     // Another +1 so we don't treat the `--` itself as an input.
     for (unsigned I = DashDashIndex + 1; I < Cmd.size(); ++I)
@@ -424,11 +423,11 @@ DriverMode getDriverMode(const std::vector<std::string> &Args) {
 // Returns the set of DriverModes where an option may be used.
 unsigned char getModes(const llvm::opt::Option &Opt) {
   unsigned char Result = DM_None;
-  if (Opt.hasVisibilityFlag(driver::options::ClangOption))
+  if (Opt.hasVisibilityFlag(options::ClangOption))
     Result |= DM_GCC;
-  if (Opt.hasVisibilityFlag(driver::options::CC1Option))
+  if (Opt.hasVisibilityFlag(options::CC1Option))
     Result |= DM_CC1;
-  if (Opt.hasVisibilityFlag(driver::options::CLOption))
+  if (Opt.hasVisibilityFlag(options::CLOption))
     Result |= DM_CL;
   return Result;
 }
@@ -442,8 +441,8 @@ llvm::ArrayRef<ArgStripper::Rule> ArgStripper::rulesFor(llvm::StringRef Arg) {
   using TableTy =
       llvm::StringMap<llvm::SmallVector<Rule, 4>, llvm::BumpPtrAllocator>;
   static TableTy *Table = [] {
-    auto &DriverTable = driver::getDriverOptTable();
-    using DriverID = clang::driver::options::ID;
+    auto &DriverTable = getDriverOptTable();
+    using DriverID = clang::options::ID;
 
     // Collect sets of aliases, so we can treat -foo and -foo= as synonyms.
     // Conceptually a double-linked list: PrevAlias[I] -> I -> NextAlias[I].
@@ -468,7 +467,7 @@ llvm::ArrayRef<ArgStripper::Rule> ArgStripper::rulesFor(llvm::StringRef Arg) {
                FLAGS, VISIBILITY, PARAM, HELPTEXT, HELPTEXTSFORVARIANTS,       \
                METAVAR, VALUES, SUBCOMMANDIDS_OFFSET)                          \
   {DriverID::OPT_##ID, DriverID::OPT_##ALIAS, ALIASARGS},
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef OPTION
     };
     for (auto &E : AliasTable)
diff --git a/clang-tools-extra/modularize/CMakeLists.txt b/clang-tools-extra/modularize/CMakeLists.txt
index eb5383c3ad44e..a775b790a3147 100644
--- a/clang-tools-extra/modularize/CMakeLists.txt
+++ b/clang-tools-extra/modularize/CMakeLists.txt
@@ -20,6 +20,7 @@ clang_target_link_libraries(modularize
   clangAST
   clangBasic
   clangDriver
+  clangOptions
   clangFrontend
   clangLex
   clangSerialization
diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp
index 1345a6ef8f489..d80d78c64c6e2 100644
--- a/clang-tools-extra/modularize/CoverageChecker.cpp
+++ b/clang-tools-extra/modularize/CoverageChecker.cpp
@@ -50,18 +50,18 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CoverageChecker.h"
 #include "ModularizeUtilities.h"
 #include "clang/AST/ASTConsumer.h"
-#include "CoverageChecker.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/SourceManager.h"
-#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Options/Options.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/Option/Option.h"
@@ -73,7 +73,7 @@
 using namespace Modularize;
 using namespace clang;
 using namespace clang::driver;
-using namespace clang::driver::options;
+using namespace clang::options;
 using namespace clang::tooling;
 namespace cl = llvm::cl;
 namespace sys = llvm::sys;
diff --git a/clang-tools-extra/modularize/Modularize.cpp b/clang-tools-extra/modularize/Modularize.cpp
index 376ad0c7875bf..33966b44f719a 100644
--- a/clang-tools-extra/modularize/Modularize.cpp
+++ b/clang-tools-extra/modularize/Modularize.cpp
@@ -231,11 +231,11 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/SourceManager.h"
-#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Options/Options.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/Option/Arg.h"
@@ -254,7 +254,7 @@
 
 using namespace clang;
 using namespace clang::driver;
-using namespace clang::driver::options;
+using namespace clang::options;
 using namespace clang::tooling;
 using namespace llvm;
 using namespace llvm::opt;
diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp
index 4dd84feac5df4..6978a6b2fe1b7 100644
--- a/clang-tools-extra/modularize/ModularizeUtilities.cpp
+++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp
@@ -12,17 +12,17 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ModularizeUtilities.h"
+#include "CoverageChecker.h"
 #include "clang/Basic/SourceManager.h"
-#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendActions.h"
-#include "CoverageChecker.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/FileUtilities.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
-#include "ModularizeUtilities.h"
 
 using namespace clang;
 using namespace llvm;
diff --git a/clang-tools-extra/pp-trace/CMakeLists.txt b/clang-tools-extra/pp-trace/CMakeLists.txt
index 1323adbc35269..da36582ee0234 100644
--- a/clang-tools-extra/pp-trace/CMakeLists.txt
+++ b/clang-tools-extra/pp-trace/CMakeLists.txt
@@ -14,6 +14,7 @@ clang_target_link_libraries(pp-trace
   PRIVATE
   clangAST
   clangBasic
+  clangOptions
   clangFrontend
   clangLex
   clangSerialization
diff --git a/clang-tools-extra/pp-trace/PPTrace.cpp b/clang-tools-extra/pp-trace/PPTrace.cpp
index 0b078c49a55b7..ba5a06a26830d 100644
--- a/clang-tools-extra/pp-trace/PPTrace.cpp
+++ b/clang-tools-extra/pp-trace/PPTrace.cpp
@@ -28,11 +28,11 @@
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/Basic/SourceManager.h"
-#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Options/Options.h"
 #include "clang/Tooling/Execution.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/Option/Arg.h"
diff --git a/clang/docs/CMakeLists.txt b/clang/docs/CMakeLists.txt
index 1f06c040c96cb..9469a832adb62 100644
--- a/clang/docs/CMakeLists.txt
+++ b/clang/docs/CMakeLists.txt
@@ -132,7 +132,7 @@ if (LLVM_ENABLE_SPHINX)
     # Generated files
     gen_rst_file_from_td(AttributeReference.rst -gen-attr-docs ../include/clang/Basic/Attr.td "${docs_targets}")
     gen_rst_file_from_td(DiagnosticsReference.rst -gen-diag-docs ../include/clang/Basic/Diagnostic.td "${docs_targets}")
-    gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Driver/ClangOptionDocs.td "${docs_targets}")
+    gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Options/ClangOptionDocs.td "${docs_targets}")
 
     # Another generated file from a different source
     set(docs_tools_dir ${CMAKE_CURRENT_SOURCE_DIR}/tools)
diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst
index eff46ab46e1ca..a849d05eb7ae9 100644
--- a/clang/docs/InternalsManual.rst
+++ b/clang/docs/InternalsManual.rst
@@ -667,7 +667,7 @@ Command Line Interface
 ----------------------
 
 The command line interface of the Clang ``-cc1`` frontend is defined alongside
-the driver options in ``clang/Driver/Options.td``. The information making up an
+the driver options in ``clang/Options/Options.td``. The information making up an
 option definition includes its prefix and name (for example ``-std=``), form and
 position of the option value, help text, aliases and more. Each option may
 belong to a certain group and can be marked with zero or more flags. Options
@@ -712,7 +712,7 @@ variable for the option value:
     }
 
 Next, declare the command line interface of the option in the tablegen file
-``clang/include/clang/Driver/Options.td``. This is done by instantiating the
+``clang/include/clang/Options/Options.td``. This is done by instantiating the
 ``Option`` class (defined in ``llvm/include/llvm/Option/OptParser.td``). The
 instance is typically created through one of the helper classes that encode the
 acceptable ways to specify the option value on the command line:
@@ -906,7 +906,7 @@ command line:
                                   SHOULD_PARSE, KEYPATH, DEFAULT_VALUE,          \
                                   IMPLIED_CHECK, IMPLIED_VALUE, NORMALIZER,      \
                                   MERGER, TABLE_INDEX)
-  #include "clang/Driver/Options.inc"
+  #include "clang/Options/Options.inc"
   #undef LANG_OPTION_WITH_MARSHALLING
 
     // ...
@@ -925,7 +925,7 @@ command line:
     GENERATE_OPTION_WITH_MARSHALLING(                                            \
         Args, SA, KIND, FLAGS, SPELLING, ALWAYS_EMIT, KEYPATH, DEFAULT_VALUE,    \
         IMPLIED_CHECK, IMPLIED_VALUE, DENORMALIZER, EXTRACTOR, TABLE_INDEX)
-  #include "clang/Driver/Options.inc"
+  #include "clang/Options/Options.inc"
   #undef LANG_OPTION_WITH_MARSHALLING
 
     // ...
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e9c0723f6787f..c43f236aab319 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -79,6 +79,9 @@ Potentially Breaking Changes
     void foo(void) {
       return ({ 1;; });
     }
+- Downstream projects that previously linked only against ``clangDriver`` may
+  now (also) need to link against the new ``clangOptions`` library, since
+  options-related code has been moved out of the Driver into a separate library.
 
 C/C++ Language Potentially Breaking Changes
 -------------------------------------------
diff --git a/clang/include/clang/CMakeLists.txt b/clang/include/clang/CMakeLists.txt
index 47ac70cd21690..77a44e4c48de5 100644
--- a/clang/include/clang/CMakeLists.txt
+++ b/clang/include/clang/CMakeLists.txt
@@ -3,7 +3,7 @@ add_subdirectory(Basic)
 if(CLANG_ENABLE_CIR)
   add_subdirectory(CIR)
 endif()
-add_subdirectory(Driver)
+add_subdirectory(Options)
 add_subdirectory(Parse)
 add_subdirectory(Sema)
 add_subdirectory(Serialization)
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index b9b187ada8add..aa86bffb802a4 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -15,11 +15,11 @@
 #include "clang/Driver/Action.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/Phases.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Driver/Types.h"
 #include "clang/Driver/Util.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/StringMap.h"
diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h
index 49fd920d1ec43..ed2703c76f18d 100644
--- a/clang/include/clang/Frontend/Utils.h
+++ b/clang/include/clang/Frontend/Utils.h
@@ -15,8 +15,8 @@
 
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LLVM.h"
-#include "clang/Driver/OptionUtils.h"
 #include "clang/Frontend/DependencyOutputOptions.h"
+#include "clang/Options/OptionUtils.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/StringMap.h"
diff --git a/clang/include/clang/Driver/CMakeLists.txt b/clang/include/clang/Options/CMakeLists.txt
similarity index 100%
rename from clang/include/clang/Driver/CMakeLists.txt
rename to clang/include/clang/Options/CMakeLists.txt
diff --git a/clang/include/clang/Driver/ClangOptionDocs.td b/clang/include/clang/Options/ClangOptionDocs.td
similarity index 100%
rename from clang/include/clang/Driver/ClangOptionDocs.td
rename to clang/include/clang/Options/ClangOptionDocs.td
diff --git a/clang/include/clang/Driver/OptionUtils.h b/clang/include/clang/Options/OptionUtils.h
similarity index 94%
rename from clang/include/clang/Driver/OptionUtils.h
rename to clang/include/clang/Options/OptionUtils.h
index 922f536bf33ea..83c48bd7d6843 100644
--- a/clang/include/clang/Driver/OptionUtils.h
+++ b/clang/include/clang/Options/OptionUtils.h
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_DRIVER_OPTIONUTILS_H
-#define LLVM_CLANG_DRIVER_OPTIONUTILS_H
+#ifndef LLVM_CLANG_OPTIONS_OPTIONUTILS_H
+#define LLVM_CLANG_OPTIONS_OPTIONUTILS_H
 
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LLVM.h"
@@ -55,4 +55,4 @@ inline uint64_t getLastArgUInt64Value(const llvm::opt::ArgList &Args,
 
 } // namespace clang
 
-#endif // LLVM_CLANG_DRIVER_OPTIONUTILS_H
+#endif // LLVM_CLANG_OPTIONS_OPTIONUTILS_H
diff --git a/clang/include/clang/Driver/Options.h b/clang/include/clang/Options/Options.h
similarity index 83%
rename from clang/include/clang/Driver/Options.h
rename to clang/include/clang/Options/Options.h
index 0797410e9940e..ac98699001965 100644
--- a/clang/include/clang/Driver/Options.h
+++ b/clang/include/clang/Options/Options.h
@@ -6,14 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_DRIVER_OPTIONS_H
-#define LLVM_CLANG_DRIVER_OPTIONS_H
+#ifndef LLVM_CLANG_OPTIONS_OPTIONS_H
+#define LLVM_CLANG_OPTIONS_OPTIONS_H
 
 #include "llvm/Option/OptTable.h"
 #include "llvm/Option/Option.h"
 
 namespace clang {
-namespace driver {
 
 namespace options {
 /// Flags specifically for clang options.  Must not overlap with
@@ -42,16 +41,15 @@ enum ClangVisibility {
 };
 
 enum ID {
-    OPT_INVALID = 0, // This is not an option ID.
+  OPT_INVALID = 0, // This is not an option ID.
 #define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__),
-#include "clang/Driver/Options.inc"
-    LastOption
+#include "clang/Options/Options.inc"
+  LastOption
 #undef OPTION
-  };
-}
+};
+} // namespace options
 
 const llvm::opt::OptTable &getDriverOptTable();
-}
-}
+} // namespace clang
 
-#endif
+#endif // LLVM_CLANG_OPTIONS_OPTIONS_H
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Options/Options.td
similarity index 100%
rename from clang/include/clang/Driver/Options.td
rename to clang/include/clang/Options/Options.td
diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap
index c5535262ae38c..a11c8683c601e 100644
--- a/clang/include/module.modulemap
+++ b/clang/include/module.modulemap
@@ -146,6 +146,7 @@ module Clang_Lex {
   module * { export * }
 }
 
+module Clang_Options { requires cplusplus umbrella "clang/Options" module * { export * } }
 module Clang_Parse { requires cplusplus umbrella "clang/Parse" module * { export * } }
 module Clang_Rewrite { requires cplusplus umbrella "clang/Rewrite/Core" module * { export * } }
 module Clang_RewriteFrontend { requires cplusplus umbrella "clang/Rewrite/Frontend" module * { export * } }
diff --git a/clang/lib/CMakeLists.txt b/clang/lib/CMakeLists.txt
index 4f2218b583e41..e90b009da606a 100644
--- a/clang/lib/CMakeLists.txt
+++ b/clang/lib/CMakeLists.txt
@@ -13,6 +13,7 @@ add_subdirectory(Edit)
 add_subdirectory(ExtractAPI)
 add_subdirectory(Rewrite)
 add_subdirectory(Driver)
+add_subdirectory(Options)
 add_subdirectory(Serialization)
 add_subdirectory(Frontend)
 add_subdirectory(FrontendTool)
diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
index 7c4f70b966c48..8052659e9836b 100644
--- a/clang/lib/Driver/CMakeLists.txt
+++ b/clang/lib/Driver/CMakeLists.txt
@@ -19,12 +19,10 @@ add_clang_library(clangDriver
   Compilation.cpp
   Distro.cpp
   Driver.cpp
-  DriverOptions.cpp
   Job.cpp
   Multilib.cpp
   MultilibBuilder.cpp
   OffloadBundler.cpp
-  OptionUtils.cpp
   Phases.cpp
   SanitizerArgs.cpp
   Tool.cpp
@@ -99,5 +97,6 @@ add_clang_library(clangDriver
   LINK_LIBS
   clangBasic
   clangLex
+  clangOptions
   ${system_libs}
   )
diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp
index 4e300316ae9ba..f8ca2a3d09407 100644
--- a/clang/lib/Driver/Compilation.cpp
+++ b/clang/lib/Driver/Compilation.cpp
@@ -11,9 +11,9 @@
 #include "clang/Driver/Action.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Job.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Driver/Util.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/OptSpecifier.h"
 #include "llvm/Option/Option.h"
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index a0b82cec9a372..9fd64d4aac514 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -60,13 +60,13 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Job.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/Phases.h"
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Driver/Types.h"
 #include "clang/Lex/DependencyDirectivesScanner.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 5dd48f53b9069..420c4cddbc8dd 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -8,8 +8,8 @@
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Basic/Sanitizers.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/ToolChain.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 7f70f1c444eb9..5ff7d83946137 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -21,9 +21,9 @@
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Job.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/XRayArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
@@ -338,7 +338,7 @@ static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple,
 
 Multilib::flags_list
 ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const {
-  using namespace clang::driver::options;
+  using namespace clang::options;
 
   std::vector<std::string> Result;
   const llvm::Triple Triple(ComputeEffectiveClangTriple(Args));
@@ -1805,7 +1805,7 @@ void ToolChain::TranslateXarchArgs(
   unsigned Index = BaseArgs.MakeIndex(A->getValue(ValuePos));
   unsigned Prev = Index;
   std::unique_ptr<llvm::opt::Arg> XarchArg(Opts.ParseOneArg(
-      Args, Index, llvm::opt::Visibility(clang::driver::options::ClangOption)));
+      Args, Index, llvm::opt::Visibility(options::ClangOption)));
 
   // If the argument parsing failed or more than one argument was
   // consumed, the -Xarch_ argument's parameter tried to consume
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index 066b59305fe3f..a8acf9cfc44c9 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -9,8 +9,8 @@
 #include "AIX.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/ProfileData/InstrProf.h"
@@ -19,6 +19,7 @@
 #include <set>
 
 using AIX = clang::driver::toolchains::AIX;
+using namespace clang;
 using namespace clang::driver;
 using namespace clang::driver::tools;
 using namespace clang::driver::toolchains;
@@ -167,8 +168,7 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA,
        Args.hasArg(options::OPT_coverage))
     CmdArgs.push_back("-bdbg:namedsects:ss");
 
-  if (Arg *A =
-          Args.getLastArg(clang::driver::options::OPT_mxcoff_build_id_EQ)) {
+  if (Arg *A = Args.getLastArg(options::OPT_mxcoff_build_id_EQ)) {
     StringRef BuildId = A->getValue();
     if (BuildId[0] != '0' || BuildId[1] != 'x' ||
         BuildId.find_if_not(llvm::isHexDigit, 2) != StringRef::npos)
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 1a243fef9532d..9dc2c6ce39ae4 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -12,8 +12,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Error.h"
@@ -323,27 +323,24 @@ RocmInstallationDetector::RocmInstallationDetector(
     const llvm::opt::ArgList &Args, bool DetectHIPRuntime, bool DetectDeviceLib)
     : D(D) {
   Verbose = Args.hasArg(options::OPT_v);
-  RocmPathArg = Args.getLastArgValue(clang::driver::options::OPT_rocm_path_EQ);
-  PrintROCmSearchDirs =
-      Args.hasArg(clang::driver::options::OPT_print_rocm_search_dirs);
+  RocmPathArg = Args.getLastArgValue(options::OPT_rocm_path_EQ);
+  PrintROCmSearchDirs = Args.hasArg(options::OPT_print_rocm_search_dirs);
   RocmDeviceLibPathArg =
-      Args.getAllArgValues(clang::driver::options::OPT_rocm_device_lib_path_EQ);
-  HIPPathArg = Args.getLastArgValue(clang::driver::options::OPT_hip_path_EQ);
-  HIPStdParPathArg =
-    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_path_EQ);
+      Args.getAllArgValues(options::OPT_rocm_device_lib_path_EQ);
+  HIPPathArg = Args.getLastArgValue(options::OPT_hip_path_EQ);
+  HIPStdParPathArg = Args.getLastArgValue(options::OPT_hipstdpar_path_EQ);
   HasHIPStdParLibrary =
     !HIPStdParPathArg.empty() && D.getVFS().exists(HIPStdParPathArg +
                                                    "/hipstdpar_lib.hpp");
   HIPRocThrustPathArg =
-    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_thrust_path_EQ);
+      Args.getLastArgValue(options::OPT_hipstdpar_thrust_path_EQ);
   HasRocThrustLibrary = !HIPRocThrustPathArg.empty() &&
                         D.getVFS().exists(HIPRocThrustPathArg + "/thrust");
-  HIPRocPrimPathArg =
-    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_prim_path_EQ);
+  HIPRocPrimPathArg = Args.getLastArgValue(options::OPT_hipstdpar_prim_path_EQ);
   HasRocPrimLibrary = !HIPRocPrimPathArg.empty() &&
                       D.getVFS().exists(HIPRocPrimPathArg + "/rocprim");
 
-  if (auto *A = Args.getLastArg(clang::driver::options::OPT_hip_version_EQ)) {
+  if (auto *A = Args.getLastArg(options::OPT_hip_version_EQ)) {
     HIPVersionArg = A->getValue();
     unsigned Major = ~0U;
     unsigned Minor = ~0U;
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h
index e90a5736911e4..7b999c311154f 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.h
+++ b/clang/lib/Driver/ToolChains/AMDGPU.h
@@ -11,9 +11,9 @@
 
 #include "Gnu.h"
 #include "clang/Basic/TargetID.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/TargetParser/TargetParser.h"
 
diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
index 2b41d54a9eb73..e14bc574d139a 100644
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -10,8 +10,8 @@
 #include "AMDGPU.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/STLExtras.h"
 
 using namespace clang::driver;
diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp
index 731076d9754a9..588255dc5a0cd 100644
--- a/clang/lib/Driver/ToolChains/AVR.cpp
+++ b/clang/lib/Driver/ToolChains/AVR.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
index e8d5e38a9064f..d6fb2a57539ed 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
@@ -9,7 +9,7 @@
 #include "AArch64.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/TargetParser/Host.h"
@@ -222,7 +222,7 @@ void aarch64::getAArch64TargetFeatures(const Driver &D,
     // Default to 'A' profile if the architecture is not specified.
     success = getAArch64ArchFeaturesFromMarch(D, "armv8-a", Args, Extensions);
 
-  if (success && (A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)))
+  if (success && (A = Args.getLastArg(options::OPT_mtune_EQ)))
     success =
         getAArch64MicroArchFeaturesFromMtune(D, A->getValue(), Args, Features);
   else if (success && (A = Args.getLastArg(options::OPT_mcpu_EQ)))
diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
index 61beb0455147d..55eb2dcf7ddf4 100644
--- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
@@ -8,7 +8,7 @@
 
 #include "ARM.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/ARMTargetParser.h"
@@ -74,7 +74,7 @@ bool arm::isARMEABIBareMetal(const llvm::Triple &Triple) {
 // Get Arch/CPU from args.
 void arm::getARMArchCPUFromArgs(const ArgList &Args, llvm::StringRef &Arch,
                                 llvm::StringRef &CPU, bool FromAs) {
-  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
+  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
     CPU = A->getValue();
   if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
     Arch = A->getValue();
diff --git a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
index 2fd2c72147f5b..65f6534e4d038 100644
--- a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
@@ -8,7 +8,7 @@
 
 #include "CSKY.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/CSKYTargetParser.h"
@@ -33,7 +33,7 @@ csky::getCSKYArchName(const Driver &D, const ArgList &Args,
     return std::optional<llvm::StringRef>(A->getValue());
   }
 
-  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
+  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
     llvm::CSKY::ArchKind ArchKind = llvm::CSKY::parseCPUArch(A->getValue());
     if (ArchKind == llvm::CSKY::ArchKind::INVALID) {
       D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args);
@@ -126,7 +126,7 @@ void csky::getCSKYTargetFeatures(const Driver &D, const llvm::Triple &Triple,
     archName = A->getValue();
   }
 
-  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
+  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
     llvm::CSKY::ArchKind Kind = llvm::CSKY::parseCPUArch(A->getValue());
     if (Kind == llvm::CSKY::ArchKind::INVALID) {
       D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args);
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 156ea03045569..da084bdabaee3 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -11,7 +11,7 @@
 #include "clang/Basic/DiagnosticDriver.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/TargetParser/Host.h"
 #include "llvm/TargetParser/LoongArchTargetParser.h"
 
@@ -130,8 +130,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
                                            const ArgList &Args,
                                            std::vector<StringRef> &Features) {
   // Enable the `lsx` feature on 64-bit LoongArch by default.
-  if (Triple.isLoongArch64() &&
-      (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ)))
+  if (Triple.isLoongArch64() && (!Args.hasArgNoClaim(options::OPT_march_EQ)))
     Features.push_back("+lsx");
 
   // -mrelax is default, unless -mno-relax is specified.
diff --git a/clang/lib/Driver/ToolChains/Arch/M68k.cpp b/clang/lib/Driver/ToolChains/Arch/M68k.cpp
index 708ec84a37cfb..a620597f10475 100644
--- a/clang/lib/Driver/ToolChains/Arch/M68k.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/M68k.cpp
@@ -8,7 +8,7 @@
 
 #include "M68k.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Regex.h"
@@ -21,7 +21,7 @@ using namespace llvm::opt;
 
 /// getM68kTargetCPU - Get the (LLVM) name of the 68000 cpu we are targeting.
 std::string m68k::getM68kTargetCPU(const ArgList &Args) {
-  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
+  if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
     // The canonical CPU name is captalize. However, we allow
     // starting with lower case or numbers only
     StringRef CPUName = A->getValue();
@@ -45,17 +45,17 @@ std::string m68k::getM68kTargetCPU(const ArgList &Args) {
         .Default(CPUName.str());
   }
   // FIXME: Throw error when multiple sub-architecture flag exist
-  if (Args.hasArg(clang::driver::options::OPT_m68000))
+  if (Args.hasArg(options::OPT_m68000))
     return "M68000";
-  if (Args.hasArg(clang::driver::options::OPT_m68010))
+  if (Args.hasArg(options::OPT_m68010))
     return "M68010";
-  if (Args.hasArg(clang::driver::options::OPT_m68020))
+  if (Args.hasArg(options::OPT_m68020))
     return "M68020";
-  if (Args.hasArg(clang::driver::options::OPT_m68030))
+  if (Args.hasArg(options::OPT_m68030))
     return "M68030";
-  if (Args.hasArg(clang::driver::options::OPT_m68040))
+  if (Args.hasArg(options::OPT_m68040))
     return "M68040";
-  if (Args.hasArg(clang::driver::options::OPT_m68060))
+  if (Args.hasArg(options::OPT_m68060))
     return "M68060";
 
   return "";
diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
index 8d7b85dbeed99..103aae7018fbf 100644
--- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
@@ -9,7 +9,7 @@
 #include "Mips.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 
@@ -49,8 +49,7 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple,
     DefMips64CPU = "mips3";
   }
 
-  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ,
-                               options::OPT_mcpu_EQ))
+  if (Arg *A = Args.getLastArg(options::OPT_march_EQ, options::OPT_mcpu_EQ))
     CPUName = A->getValue();
 
   if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) {
diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
index 361a68a892a8f..44afdd249fea5 100644
--- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
@@ -9,7 +9,7 @@
 #include "PPC.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/Host.h"
diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
index f2e79e71f93d4..1dcce6d053a39 100644
--- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
@@ -10,7 +10,7 @@
 #include "../Clang.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Error.h"
 #include "llvm/TargetParser/Host.h"
diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
index 94a94f1e9c487..49256d80cbdf6 100644
--- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
@@ -8,7 +8,7 @@
 
 #include "Sparc.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/Host.h"
@@ -122,7 +122,7 @@ sparc::FloatABI sparc::getSparcFloatABI(const Driver &D,
 
 std::string sparc::getSparcTargetCPU(const Driver &D, const ArgList &Args,
                                      const llvm::Triple &Triple) {
-  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
+  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
     StringRef CPUName = A->getValue();
     if (CPUName == "native") {
       std::string CPU = std::string(llvm::sys::getHostCPUName());
diff --git a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
index 75b6afd925245..1ef6a725483e8 100644
--- a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
@@ -8,7 +8,7 @@
 
 #include "SystemZ.h"
 #include "clang/Config/config.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/Host.h"
 
@@ -25,9 +25,9 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D,
     D.Diag(diag::err_drv_unsupported_opt)
       << Args.getLastArg(options::OPT_mfloat_abi_EQ)->getAsString(Args);
 
-  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_msoft_float,
-                               options::OPT_mhard_float))
-    if (A->getOption().matches(clang::driver::options::OPT_msoft_float))
+  if (Arg *A =
+          Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float))
+    if (A->getOption().matches(options::OPT_msoft_float))
       ABI = systemz::FloatABI::Soft;
 
   return ABI;
@@ -35,7 +35,7 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D,
 
 std::string systemz::getSystemZTargetCPU(const ArgList &Args,
                                          const llvm::Triple &T) {
-  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
+  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
     llvm::StringRef CPUName = A->getValue();
 
     if (CPUName == "native") {
diff --git a/clang/lib/Driver/ToolChains/Arch/VE.cpp b/clang/lib/Driver/ToolChains/Arch/VE.cpp
index adc0873586588..c8353d7dc5f3a 100644
--- a/clang/lib/Driver/ToolChains/Arch/VE.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/VE.cpp
@@ -8,7 +8,7 @@
 
 #include "VE.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 
 using namespace clang::driver;
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index 1373905a5120e..092069b6ade56 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -8,7 +8,7 @@
 
 #include "X86.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Option/ArgList.h"
@@ -21,7 +21,7 @@ using namespace llvm::opt;
 
 std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args,
                                  const llvm::Triple &Triple) {
-  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
+  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
     StringRef CPU = A->getValue();
     if (CPU != "native")
       return std::string(CPU);
@@ -119,7 +119,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
                                std::vector<StringRef> &Features) {
   // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or
   // "ms_abi" as default function attributes.
-  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) {
+  if (const Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) {
     StringRef DefaultAbi =
         (Triple.isOSWindows() || Triple.isUEFI()) ? "ms" : "sysv";
     if (A->getValue() != DefaultAbi)
@@ -128,7 +128,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
   }
 
   // If -march=native, autodetect the feature list.
-  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
+  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
     if (StringRef(A->getValue()) == "native") {
       for (auto &F : llvm::sys::getHostCPUFeatures())
         Features.push_back(
@@ -163,7 +163,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
   // flags). This is a bit hacky but keeps existing usages working. We should
   // consider deprecating this and instead warn if the user requests external
   // retpoline thunks and *doesn't* request some form of retpolines.
-  auto SpectreOpt = clang::driver::options::ID::OPT_INVALID;
+  auto SpectreOpt = options::ID::OPT_INVALID;
   if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline,
                          options::OPT_mspeculative_load_hardening,
                          options::OPT_mno_speculative_load_hardening)) {
@@ -189,7 +189,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
     SpectreOpt = options::OPT_mretpoline_external_thunk;
   }
 
-  auto LVIOpt = clang::driver::options::ID::OPT_INVALID;
+  auto LVIOpt = options::ID::OPT_INVALID;
   if (Args.hasFlag(options::OPT_mlvi_hardening, options::OPT_mno_lvi_hardening,
                    false)) {
     Features.push_back("+lvi-load-hardening");
@@ -207,7 +207,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
           << D.getOpts().getOptionName(options::OPT_mlvi_hardening)
           << D.getOpts().getOptionName(options::OPT_m_seses);
 
-    if (SpectreOpt != clang::driver::options::ID::OPT_INVALID)
+    if (SpectreOpt != options::ID::OPT_INVALID)
       D.Diag(diag::err_drv_argument_not_allowed_with)
           << D.getOpts().getOptionName(SpectreOpt)
           << D.getOpts().getOptionName(options::OPT_m_seses);
@@ -219,8 +219,8 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
     }
   }
 
-  if (SpectreOpt != clang::driver::options::ID::OPT_INVALID &&
-      LVIOpt != clang::driver::options::ID::OPT_INVALID) {
+  if (SpectreOpt != options::ID::OPT_INVALID &&
+      LVIOpt != options::ID::OPT_INVALID) {
     D.Diag(diag::err_drv_argument_not_allowed_with)
         << D.getOpts().getOptionName(SpectreOpt)
         << D.getOpts().getOptionName(LVIOpt);
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index 9b7f58c392885..8d598be9ffb0a 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -18,7 +18,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/MultilibBuilder.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
@@ -135,7 +135,7 @@ static std::string computeClangRuntimesSysRoot(const Driver &D,
 bool BareMetal::initGCCInstallation(const llvm::Triple &Triple,
                                     const llvm::opt::ArgList &Args) {
   if (Args.getLastArg(options::OPT_gcc_toolchain) ||
-      Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) {
+      Args.getLastArg(clang::options::OPT_gcc_install_dir_EQ)) {
     GCCInstallation.init(Triple, Args);
     return GCCInstallation.isValid();
   }
diff --git a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
index e4db3307ee3aa..c561d7d38da5b 100644
--- a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
+++ b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index e20963ac288d8..2791b1e57877e 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -29,10 +29,10 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Distro.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/Types.h"
 #include "clang/Driver/XRayArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
@@ -65,7 +65,7 @@ using namespace clang;
 using namespace llvm::opt;
 
 static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) {
-  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC,
+  if (Arg *A = Args.getLastArg(options::OPT_C, options::OPT_CC,
                                options::OPT_fminimize_whitespace,
                                options::OPT_fno_minimize_whitespace,
                                options::OPT_fkeep_system_includes,
@@ -1661,7 +1661,7 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
 
   AddAAPCSVolatileBitfieldArgs(Args, CmdArgs);
 
-  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
+  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
     CmdArgs.push_back("-tune-cpu");
     if (strcmp(A->getValue(), "native") == 0)
       CmdArgs.push_back(Args.MakeArgString(llvm::sys::getHostCPUName()));
@@ -2067,7 +2067,7 @@ void Clang::AddSparcTargetArgs(const ArgList &Args,
     CmdArgs.push_back("hard");
   }
 
-  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
+  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
     StringRef Name = A->getValue();
     std::string TuneCPU;
     if (Name == "native")
@@ -2173,12 +2173,11 @@ void Clang::AddX86TargetArgs(const ArgList &Args,
 
   // Default to "generic" unless -march is present or targetting the PS4/PS5.
   std::string TuneCPU;
-  if (!Args.hasArg(clang::driver::options::OPT_march_EQ) &&
-      !getToolChain().getTriple().isPS())
+  if (!Args.hasArg(options::OPT_march_EQ) && !getToolChain().getTriple().isPS())
     TuneCPU = "generic";
 
   // Override based on -mtune.
-  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
+  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
     StringRef Name = A->getValue();
 
     if (Name == "native") {
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index ec8dcdc81db56..9e3ca9f281195 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -31,11 +31,11 @@
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Job.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Driver/Util.h"
 #include "clang/Driver/XRayArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
@@ -69,8 +69,7 @@ using namespace llvm::opt;
 
 static bool useFramePointerForTargetByDefault(const llvm::opt::ArgList &Args,
                                               const llvm::Triple &Triple) {
-  if (Args.hasArg(clang::driver::options::OPT_pg) &&
-      !Args.hasArg(clang::driver::options::OPT_mfentry))
+  if (Args.hasArg(options::OPT_pg) && !Args.hasArg(options::OPT_mfentry))
     return true;
 
   if (Triple.isAndroid())
@@ -249,17 +248,16 @@ getFramePointerKind(const llvm::opt::ArgList &Args,
   // without requiring new frame records to be created.
 
   bool DefaultFP = useFramePointerForTargetByDefault(Args, Triple);
-  bool EnableFP =
-      mustUseNonLeafFramePointerForTarget(Triple) ||
-      Args.hasFlag(clang::driver::options::OPT_fno_omit_frame_pointer,
-                   clang::driver::options::OPT_fomit_frame_pointer, DefaultFP);
+  bool EnableFP = mustUseNonLeafFramePointerForTarget(Triple) ||
+                  Args.hasFlag(options::OPT_fno_omit_frame_pointer,
+                               options::OPT_fomit_frame_pointer, DefaultFP);
 
   bool DefaultLeafFP =
       useLeafFramePointerForTargetByDefault(Triple) ||
       (EnableFP && framePointerImpliesLeafFramePointer(Args, Triple));
-  bool EnableLeafFP = Args.hasFlag(
-      clang::driver::options::OPT_mno_omit_leaf_frame_pointer,
-      clang::driver::options::OPT_momit_leaf_frame_pointer, DefaultLeafFP);
+  bool EnableLeafFP =
+      Args.hasFlag(options::OPT_mno_omit_leaf_frame_pointer,
+                   options::OPT_momit_leaf_frame_pointer, DefaultLeafFP);
 
   bool FPRegReserved = EnableFP || mustMaintainValidFrameChain(Args, Triple);
 
@@ -753,7 +751,7 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args,
   case llvm::Triple::ppcle:
   case llvm::Triple::ppc64:
   case llvm::Triple::ppc64le:
-    if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
+    if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
       return std::string(
           llvm::PPC::getNormalizedPPCTargetCPU(T, A->getValue()));
     return std::string(llvm::PPC::getNormalizedPPCTargetCPU(T));
@@ -1733,7 +1731,7 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
     if (SanArgs.needsFuzzerInterceptors())
       addSanitizerRuntime(TC, Args, CmdArgs, "fuzzer_interceptors", false,
                           true);
-    if (!Args.hasArg(clang::driver::options::OPT_nostdlibxx)) {
+    if (!Args.hasArg(options::OPT_nostdlibxx)) {
       bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) &&
                                  !Args.hasArg(options::OPT_static);
       if (OnlyLibstdcxxStatic)
@@ -3385,7 +3383,7 @@ void tools::handleInterchangeLoopsArgs(const ArgList &Args,
 // Otherwise, return an empty string and issue a diagnosic message if needed.
 StringRef tools::parseMPreferVectorWidthOption(clang::DiagnosticsEngine &Diags,
                                                const llvm::opt::ArgList &Args) {
-  Arg *A = Args.getLastArg(clang::driver::options::OPT_mprefer_vector_width_EQ);
+  Arg *A = Args.getLastArg(options::OPT_mprefer_vector_width_EQ);
   if (!A)
     return "";
 
diff --git a/clang/lib/Driver/ToolChains/CrossWindows.cpp b/clang/lib/Driver/ToolChains/CrossWindows.cpp
index 51c892fc91718..6df5315e8fff8 100644
--- a/clang/lib/Driver/ToolChains/CrossWindows.cpp
+++ b/clang/lib/Driver/ToolChains/CrossWindows.cpp
@@ -10,8 +10,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 07201cc4676ac..6cc73ff5fc1f6 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -14,7 +14,7 @@
 #include "clang/Driver/Distro.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE
 #include "llvm/Option/ArgList.h"
@@ -153,16 +153,16 @@ CudaInstallationDetector::CudaInstallationDetector(
   std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"};
   auto &FS = D.getVFS();
 
-  if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) {
+  if (Args.hasArg(options::OPT_cuda_path_EQ)) {
     Candidates.emplace_back(
-        Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str());
+        Args.getLastArgValue(options::OPT_cuda_path_EQ).str());
   } else if (HostTriple.isOSWindows()) {
     for (const char *Ver : Versions)
       Candidates.emplace_back(
           D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" +
           Ver);
   } else {
-    if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) {
+    if (!Args.hasArg(options::OPT_cuda_path_ignore_env)) {
       // Try to find ptxas binary. If the executable is located in a directory
       // called 'bin/', its parent directory might be a good guess for a valid
       // CUDA installation.
diff --git a/clang/lib/Driver/ToolChains/Cygwin.cpp b/clang/lib/Driver/ToolChains/Cygwin.cpp
index d9c16347daa34..55438125ce0f1 100644
--- a/clang/lib/Driver/ToolChains/Cygwin.cpp
+++ b/clang/lib/Driver/ToolChains/Cygwin.cpp
@@ -10,7 +10,7 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
 
@@ -58,7 +58,7 @@ void Cygwin::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   const Driver &D = getDriver();
   std::string SysRoot = computeSysRoot();
 
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 2fb7652d64536..fc3cd9030f71d 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -14,8 +14,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/ProfileData/InstrProf.h"
@@ -1079,7 +1079,7 @@ StringRef MachO::getMachOArchName(const ArgList &Args) const {
 
   case llvm::Triple::thumb:
   case llvm::Triple::arm:
-    if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ))
+    if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
       if (const char *Arch = ArmMachOArchName(A->getValue()))
         return Arch;
 
@@ -2993,7 +2993,7 @@ DerivedArgList *MachO::TranslateArgs(const DerivedArgList &Args,
   if (!BoundArch.empty()) {
     StringRef Name = BoundArch;
     const Option MCpu = Opts.getOption(options::OPT_mcpu_EQ);
-    const Option MArch = Opts.getOption(clang::driver::options::OPT_march_EQ);
+    const Option MArch = Opts.getOption(options::OPT_march_EQ);
 
     // This code must be kept in sync with LLVM's getArchTypeForDarwinArch,
     // which defines the list of which architectures we accept.
diff --git a/clang/lib/Driver/ToolChains/DragonFly.cpp b/clang/lib/Driver/ToolChains/DragonFly.cpp
index 524f5f2ff391e..d4a6d6ae3e349 100644
--- a/clang/lib/Driver/ToolChains/DragonFly.cpp
+++ b/clang/lib/Driver/ToolChains/DragonFly.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 
@@ -219,7 +219,7 @@ void DragonFly::AddClangSystemIncludeArgs(
     llvm::opt::ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 88bce181d40d2..038395e4b68f2 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -11,7 +11,7 @@
 
 #include "clang/Basic/CodeGenOptions.h"
 #include "clang/Driver/CommonArgs.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Frontend/Debug/Options.h"
 #include "llvm/Support/Path.h"
 #include "llvm/TargetParser/Host.h"
@@ -230,7 +230,7 @@ void Flang::addCodegenOptions(const ArgList &Args,
        options::OPT_fstack_repack_arrays, options::OPT_fno_stack_repack_arrays,
        options::OPT_ftime_report, options::OPT_ftime_report_EQ,
        options::OPT_funroll_loops, options::OPT_fno_unroll_loops});
-  if (Args.hasArg(clang::driver::options::OPT_fcoarray))
+  if (Args.hasArg(options::OPT_fcoarray))
     CmdArgs.push_back("-fcoarray");
 }
 
diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp
index b17b76233ad30..70e66a2f5c3e7 100644
--- a/clang/lib/Driver/ToolChains/FreeBSD.cpp
+++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp
@@ -13,8 +13,8 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/VirtualFileSystem.h"
 
@@ -404,7 +404,7 @@ void FreeBSD::AddClangSystemIncludeArgs(
     llvm::opt::ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp
index 507cc03b27513..9edfc4de3d602 100644
--- a/clang/lib/Driver/ToolChains/Fuchsia.cpp
+++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp
@@ -12,8 +12,8 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/MultilibBuilder.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -344,7 +344,7 @@ Tool *Fuchsia::buildStaticLibTool() const {
 
 ToolChain::RuntimeLibType
 Fuchsia::GetRuntimeLibType(const ArgList &Args) const {
-  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) {
+  if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) {
     StringRef Value = A->getValue();
     if (Value != "compiler-rt")
       getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name)
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 2dd8cc8422cd7..1bfcd1f4f3a7c 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -20,9 +20,9 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/MultilibBuilder.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Option/ArgList.h"
@@ -2058,7 +2058,7 @@ Generic_GCC::GCCVersion Generic_GCC::GCCVersion::Parse(StringRef VersionText) {
 
 static llvm::StringRef getGCCToolchainDir(const ArgList &Args,
                                           llvm::StringRef SysRoot) {
-  const Arg *A = Args.getLastArg(clang::driver::options::OPT_gcc_toolchain);
+  const Arg *A = Args.getLastArg(options::OPT_gcc_toolchain);
   if (A)
     return A->getValue();
 
@@ -2111,8 +2111,7 @@ void Generic_GCC::GCCInstallationDetector::init(
                            CandidateBiarchTripleAliases);
 
   // If --gcc-install-dir= is specified, skip filesystem detection.
-  if (const Arg *A =
-          Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ);
+  if (const Arg *A = Args.getLastArg(options::OPT_gcc_install_dir_EQ);
       A && A->getValue()[0]) {
     StringRef InstallDir = A->getValue();
     if (!ScanGCCForMultilibs(TargetTriple, Args, InstallDir, false)) {
@@ -2135,8 +2134,7 @@ void Generic_GCC::GCCInstallationDetector::init(
 
   // If --gcc-triple is specified use this instead of trying to
   // auto-detect a triple.
-  if (const Arg *A =
-          Args.getLastArg(clang::driver::options::OPT_gcc_triple_EQ)) {
+  if (const Arg *A = Args.getLastArg(options::OPT_gcc_triple_EQ)) {
     StringRef GCCTriple = A->getValue();
     CandidateTripleAliases.clear();
     CandidateTripleAliases.push_back(GCCTriple);
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index c0c8afec07264..0fbfa090ed9d3 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -15,8 +15,8 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/TargetParser/TargetParser.h"
diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp
index bce7f46dea468..be0f49d8e1497 100644
--- a/clang/lib/Driver/ToolChains/HIPSPV.cpp
+++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp
@@ -12,7 +12,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 
@@ -211,7 +211,7 @@ HIPSPVToolChain::getDeviceLibs(
   // Find device libraries in --hip-device-lib-path and HIP_DEVICE_LIB_PATH.
   auto HipDeviceLibPathArgs = DriverArgs.getAllArgValues(
       // --hip-device-lib-path is alias to this option.
-      clang::driver::options::OPT_rocm_device_lib_path_EQ);
+      options::OPT_rocm_device_lib_path_EQ);
   for (auto Path : HipDeviceLibPathArgs)
     LibraryPaths.push_back(DriverArgs.MakeArgString(Path));
 
diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp
index 732403e69a075..1af2ae6470f1e 100644
--- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
+++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
@@ -9,7 +9,7 @@
 #include "HIPUtility.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Object/Archive.h"
diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp
index 9f8b676fc7dc2..084f51721315c 100644
--- a/clang/lib/Driver/ToolChains/Hexagon.cpp
+++ b/clang/lib/Driver/ToolChains/Hexagon.cpp
@@ -11,7 +11,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
diff --git a/clang/lib/Driver/ToolChains/Hurd.cpp b/clang/lib/Driver/ToolChains/Hurd.cpp
index 43121233ea7d0..53ee4d4c0cbde 100644
--- a/clang/lib/Driver/ToolChains/Hurd.cpp
+++ b/clang/lib/Driver/ToolChains/Hurd.cpp
@@ -10,7 +10,7 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
 
@@ -168,7 +168,7 @@ void Hurd::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   const Driver &D = getDriver();
   std::string SysRoot = computeSysRoot();
 
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 94a9fe8b1a63f..020e7465548fe 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -16,8 +16,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Distro.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/Path.h"
@@ -731,7 +731,7 @@ void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   const Driver &D = getDriver();
   std::string SysRoot = computeSysRoot();
 
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   // Add 'include' in the resource directory, which is similar to
diff --git a/clang/lib/Driver/ToolChains/MSP430.cpp b/clang/lib/Driver/ToolChains/MSP430.cpp
index 9eca1ad5f2865..3cc56bb7e832e 100644
--- a/clang/lib/Driver/ToolChains/MSP430.cpp
+++ b/clang/lib/Driver/ToolChains/MSP430.cpp
@@ -12,7 +12,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Multilib.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 
diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
index bb469ff095cd4..fcae5b7a18f34 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -12,8 +12,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/ConvertUTF.h"
diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp
index da4a9072317f4..1bbabdfc631b8 100644
--- a/clang/lib/Driver/ToolChains/Managarm.cpp
+++ b/clang/lib/Driver/ToolChains/Managarm.cpp
@@ -11,8 +11,8 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 
@@ -136,7 +136,7 @@ void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   const Driver &D = getDriver();
   std::string SysRoot = computeSysRoot();
 
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp
index 1bb9bcfe6aab2..2c9a174069f70 100644
--- a/clang/lib/Driver/ToolChains/MinGW.cpp
+++ b/clang/lib/Driver/ToolChains/MinGW.cpp
@@ -12,8 +12,8 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
diff --git a/clang/lib/Driver/ToolChains/MipsLinux.cpp b/clang/lib/Driver/ToolChains/MipsLinux.cpp
index 7dd3936613296..58d6b5031f536 100644
--- a/clang/lib/Driver/ToolChains/MipsLinux.cpp
+++ b/clang/lib/Driver/ToolChains/MipsLinux.cpp
@@ -9,7 +9,7 @@
 #include "MipsLinux.h"
 #include "Arch/Mips.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -38,7 +38,7 @@ MipsLLVMToolChain::MipsLLVMToolChain(const Driver &D,
 
 void MipsLLVMToolChain::AddClangSystemIncludeArgs(
     const ArgList &DriverArgs, ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   const Driver &D = getDriver();
diff --git a/clang/lib/Driver/ToolChains/NetBSD.cpp b/clang/lib/Driver/ToolChains/NetBSD.cpp
index 8db00deeb80df..ea722b59853d6 100644
--- a/clang/lib/Driver/ToolChains/NetBSD.cpp
+++ b/clang/lib/Driver/ToolChains/NetBSD.cpp
@@ -14,8 +14,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/VirtualFileSystem.h"
 
@@ -466,7 +466,7 @@ void NetBSD::AddClangSystemIncludeArgs(
     llvm::opt::ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp
index 00991504e97a8..607eb714f85dc 100644
--- a/clang/lib/Driver/ToolChains/OHOS.cpp
+++ b/clang/lib/Driver/ToolChains/OHOS.cpp
@@ -12,8 +12,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/FileSystem.h"
@@ -174,7 +174,7 @@ OHOS::OHOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
 
 ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(
     const ArgList &Args) const {
-  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) {
+  if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) {
     StringRef Value = A->getValue();
     if (Value != "compiler-rt")
       getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name)
diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp
index 8f589186af343..5e7b4f1a664e6 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.cpp
+++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp
@@ -13,8 +13,8 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
@@ -315,7 +315,7 @@ void OpenBSD::AddClangSystemIncludeArgs(
     llvm::opt::ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
diff --git a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
index 8d381c4f14371..76180431ee682 100644
--- a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
+++ b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
@@ -8,7 +8,7 @@
 
 #include "PPCFreeBSD.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Support/Path.h"
 
 using namespace clang::driver::toolchains;
@@ -16,7 +16,7 @@ using namespace llvm::opt;
 
 void PPCFreeBSDToolChain::AddClangSystemIncludeArgs(
     const ArgList &DriverArgs, ArgStringList &CC1Args) const {
-  if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) &&
+  if (!DriverArgs.hasArg(options::OPT_nostdinc) &&
       !DriverArgs.hasArg(options::OPT_nobuiltininc)) {
     const Driver &D = getDriver();
     SmallString<128> P(D.ResourceDir);
diff --git a/clang/lib/Driver/ToolChains/PPCLinux.cpp b/clang/lib/Driver/ToolChains/PPCLinux.cpp
index 768214e416bd7..672ebd5b7b98d 100644
--- a/clang/lib/Driver/ToolChains/PPCLinux.cpp
+++ b/clang/lib/Driver/ToolChains/PPCLinux.cpp
@@ -8,7 +8,7 @@
 
 #include "PPCLinux.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 
@@ -58,7 +58,7 @@ PPCLinuxToolChain::PPCLinuxToolChain(const Driver &D,
 
 void PPCLinuxToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                   ArgStringList &CC1Args) const {
-  if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) &&
+  if (!DriverArgs.hasArg(options::OPT_nostdinc) &&
       !DriverArgs.hasArg(options::OPT_nobuiltininc)) {
     const Driver &D = getDriver();
     SmallString<128> P(D.ResourceDir);
diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
index 34ec65ae59602..6fe18aa4cceba 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
+++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
@@ -11,8 +11,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
diff --git a/clang/lib/Driver/ToolChains/SPIRV.cpp b/clang/lib/Driver/ToolChains/SPIRV.cpp
index ea824dbad54cb..27de55cfebfc1 100644
--- a/clang/lib/Driver/ToolChains/SPIRV.cpp
+++ b/clang/lib/Driver/ToolChains/SPIRV.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 
 using namespace clang::driver;
 using namespace clang::driver::toolchains;
diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp
index 0232b047a6c4b..85859f344b491 100644
--- a/clang/lib/Driver/ToolChains/SYCL.cpp
+++ b/clang/lib/Driver/ToolChains/SYCL.cpp
@@ -20,7 +20,7 @@ SYCLInstallationDetector::SYCLInstallationDetector(
 
 void SYCLInstallationDetector::addSYCLIncludeArgs(
     const ArgList &DriverArgs, ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nobuiltininc))
+  if (DriverArgs.hasArg(options::OPT_nobuiltininc))
     return;
 
   // Add the SYCL header search locations in the specified order.
diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp
index 64c7d1ceb3a36..ad0f41144f393 100644
--- a/clang/lib/Driver/ToolChains/Solaris.cpp
+++ b/clang/lib/Driver/ToolChains/Solaris.cpp
@@ -13,9 +13,9 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/ToolChain.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
@@ -360,7 +360,7 @@ void Solaris::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                         ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
diff --git a/clang/lib/Driver/ToolChains/UEFI.cpp b/clang/lib/Driver/ToolChains/UEFI.cpp
index d2be147c7b9f6..7732e37f8061d 100644
--- a/clang/lib/Driver/ToolChains/UEFI.cpp
+++ b/clang/lib/Driver/ToolChains/UEFI.cpp
@@ -11,8 +11,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/VirtualFileSystem.h"
diff --git a/clang/lib/Driver/ToolChains/VEToolchain.cpp b/clang/lib/Driver/ToolChains/VEToolchain.cpp
index ad9129046c3e1..78509bcdae0fe 100644
--- a/clang/lib/Driver/ToolChains/VEToolchain.cpp
+++ b/clang/lib/Driver/ToolChains/VEToolchain.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 #include <cstdlib> // ::getenv
@@ -78,7 +78,7 @@ bool VEToolChain::hasBlocksRuntime() const { return false; }
 
 void VEToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                             ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   if (DriverArgs.hasArg(options::OPT_nobuiltininc) &&
@@ -117,7 +117,7 @@ void VEToolChain::addClangTargetOptions(const ArgList &DriverArgs,
 
 void VEToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
                                                ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
+  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
       DriverArgs.hasArg(options::OPT_nostdlibinc) ||
       DriverArgs.hasArg(options::OPT_nostdincxx))
     return;
diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp
index 5054868b5ff4d..15c6f19e87fee 100644
--- a/clang/lib/Driver/ToolChains/WebAssembly.cpp
+++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp
@@ -12,7 +12,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_VERSION_STRING
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
@@ -297,7 +297,7 @@ bool WebAssembly::HasNativeLLVMSupport() const { return true; }
 void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs,
                                         ArgStringList &CC1Args,
                                         Action::OffloadKind) const {
-  if (!DriverArgs.hasFlag(clang::driver::options::OPT_fuse_init_array,
+  if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
                           options::OPT_fno_use_init_array, true))
     CC1Args.push_back("-fno-use-init-array");
 
@@ -472,7 +472,7 @@ WebAssembly::GetCXXStdlibType(const ArgList &Args) const {
 
 void WebAssembly::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                             ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+  if (DriverArgs.hasArg(options::OPT_nostdinc))
     return;
 
   const Driver &D = getDriver();
diff --git a/clang/lib/Driver/ToolChains/XCore.cpp b/clang/lib/Driver/ToolChains/XCore.cpp
index 6a2a75cb99739..dd26c11affffb 100644
--- a/clang/lib/Driver/ToolChains/XCore.cpp
+++ b/clang/lib/Driver/ToolChains/XCore.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include <cstdlib> // ::getenv
 
@@ -113,7 +113,7 @@ bool XCoreToolChain::hasBlocksRuntime() const { return false; }
 
 void XCoreToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
+  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
       DriverArgs.hasArg(options::OPT_nostdlibinc))
     return;
   if (const char *cl_include_dir = getenv("XCC_C_INCLUDE_PATH")) {
@@ -137,7 +137,7 @@ void XCoreToolChain::addClangTargetOptions(const ArgList &DriverArgs,
 
 void XCoreToolChain::AddClangCXXStdlibIncludeArgs(
     const ArgList &DriverArgs, ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
+  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
       DriverArgs.hasArg(options::OPT_nostdlibinc) ||
       DriverArgs.hasArg(options::OPT_nostdincxx))
     return;
diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp
index 9a3c45323a3cf..eac8f623f9a50 100644
--- a/clang/lib/Driver/ToolChains/ZOS.cpp
+++ b/clang/lib/Driver/ToolChains/ZOS.cpp
@@ -9,7 +9,7 @@
 #include "ZOS.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/WithColor.h"
diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp
index 0325296f84b19..4c2d11751a363 100644
--- a/clang/lib/Driver/XRayArgs.cpp
+++ b/clang/lib/Driver/XRayArgs.cpp
@@ -8,8 +8,8 @@
 #include "clang/Driver/XRayArgs.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/ToolChain.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/SpecialCaseList.h"
diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt
index a916667208845..dac9e0d26f393 100644
--- a/clang/lib/Frontend/CMakeLists.txt
+++ b/clang/lib/Frontend/CMakeLists.txt
@@ -52,6 +52,7 @@ add_clang_library(clangFrontend
   clangAST
   clangBasic
   clangDriver
+  clangOptions
   clangEdit
   clangLex
   clangParse
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index be7c1d367e082..f584a2a5824b2 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -27,7 +27,6 @@
 #include "clang/Basic/XRayInstr.h"
 #include "clang/Config/config.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Frontend/CommandLineSourceLoc.h"
 #include "clang/Frontend/DependencyOutputOptions.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
@@ -38,6 +37,7 @@
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/HeaderSearchOptions.h"
 #include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Options/Options.h"
 #include "clang/Serialization/ASTBitCodes.h"
 #include "clang/Serialization/ModuleFileExtension.h"
 #include "clang/StaticAnalyzer/Core/AnalyzerOptions.h"
@@ -255,7 +255,7 @@ CowCompilerInvocation::getMutPreprocessorOutputOpts() {
 using ArgumentConsumer = CompilerInvocation::ArgumentConsumer;
 
 #define OPTTABLE_STR_TABLE_CODE
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef OPTTABLE_STR_TABLE_CODE
 
 static llvm::StringRef lookupStrInTable(unsigned Offset) {
@@ -263,7 +263,7 @@ static llvm::StringRef lookupStrInTable(unsigned Offset) {
 }
 
 #define SIMPLE_ENUM_VALUE_TABLE
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef SIMPLE_ENUM_VALUE_TABLE
 
 static std::optional<bool> normalizeSimpleFlag(OptSpecifier Opt,
@@ -981,7 +981,7 @@ static void GenerateAnalyzerArgs(const AnalyzerOptions &Opts,
 
 #define ANALYZER_OPTION_WITH_MARSHALLING(...)                                  \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef ANALYZER_OPTION_WITH_MARSHALLING
 
   if (Opts.AnalysisConstraintsOpt != RangeConstraintsModel) {
@@ -1068,7 +1068,7 @@ static bool ParseAnalyzerArgs(AnalyzerOptions &Opts, ArgList &Args,
 
 #define ANALYZER_OPTION_WITH_MARSHALLING(...)                                  \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef ANALYZER_OPTION_WITH_MARSHALLING
 
   if (Arg *A = Args.getLastArg(OPT_analyzer_constraints)) {
@@ -1575,7 +1575,7 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts,
 
 #define CODEGEN_OPTION_WITH_MARSHALLING(...)                                   \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef CODEGEN_OPTION_WITH_MARSHALLING
 
   if (Opts.OptimizationLevel > 0) {
@@ -1880,7 +1880,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
 
 #define CODEGEN_OPTION_WITH_MARSHALLING(...)                                   \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef CODEGEN_OPTION_WITH_MARSHALLING
 
   // At O0 we want to fully disable inlining outside of cases marked with
@@ -2371,7 +2371,7 @@ static void GenerateDependencyOutputArgs(const DependencyOutputOptions &Opts,
   const DependencyOutputOptions &DependencyOutputOpts = Opts;
 #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...)                         \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING
 
   if (Opts.ShowIncludesDest != ShowIncludesDestination::None)
@@ -2406,7 +2406,7 @@ static bool ParseDependencyOutputArgs(DependencyOutputOptions &Opts,
   DependencyOutputOptions &DependencyOutputOpts = Opts;
 #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...)                         \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING
 
   if (Args.hasArg(OPT_show_includes)) {
@@ -2534,7 +2534,7 @@ static void GenerateFileSystemArgs(const FileSystemOptions &Opts,
 
 #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...)                               \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING
 }
 
@@ -2546,7 +2546,7 @@ static bool ParseFileSystemArgs(FileSystemOptions &Opts, const ArgList &Args,
 
 #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...)                               \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING
 
   return Diags.getNumErrors() == NumErrorsBefore;
@@ -2557,7 +2557,7 @@ static void GenerateMigratorArgs(const MigratorOptions &Opts,
   const MigratorOptions &MigratorOpts = Opts;
 #define MIGRATOR_OPTION_WITH_MARSHALLING(...)                                  \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef MIGRATOR_OPTION_WITH_MARSHALLING
 }
 
@@ -2569,7 +2569,7 @@ static bool ParseMigratorArgs(MigratorOptions &Opts, const ArgList &Args,
 
 #define MIGRATOR_OPTION_WITH_MARSHALLING(...)                                  \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef MIGRATOR_OPTION_WITH_MARSHALLING
 
   return Diags.getNumErrors() == NumErrorsBefore;
@@ -2581,7 +2581,7 @@ void CompilerInvocationBase::GenerateDiagnosticArgs(
   const DiagnosticOptions *DiagnosticOpts = &Opts;
 #define DIAG_OPTION_WITH_MARSHALLING(...)                                      \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef DIAG_OPTION_WITH_MARSHALLING
 
   if (!Opts.DiagnosticSerializationFile.empty())
@@ -2686,7 +2686,7 @@ bool clang::ParseDiagnosticArgs(DiagnosticOptions &Opts, ArgList &Args,
 
 #define DIAG_OPTION_WITH_MARSHALLING(...)                                      \
   PARSE_OPTION_WITH_MARSHALLING(Args, *Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef DIAG_OPTION_WITH_MARSHALLING
 
   llvm::sys::Process::UseANSIEscapeCodes(Opts.UseANSIEscapeCodes);
@@ -2836,7 +2836,7 @@ static void GenerateFrontendArgs(const FrontendOptions &Opts,
   const FrontendOptions &FrontendOpts = Opts;
 #define FRONTEND_OPTION_WITH_MARSHALLING(...)                                  \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef FRONTEND_OPTION_WITH_MARSHALLING
 
   std::optional<OptSpecifier> ProgramActionOpt =
@@ -3006,7 +3006,7 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
 
 #define FRONTEND_OPTION_WITH_MARSHALLING(...)                                  \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef FRONTEND_OPTION_WITH_MARSHALLING
 
   Opts.ProgramAction = frontend::ParseSyntaxOnly;
@@ -3288,7 +3288,7 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts,
   const HeaderSearchOptions *HeaderSearchOpts = &Opts;
 #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...)                             \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING
 
   if (Opts.UseLibcxx)
@@ -3403,7 +3403,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args,
 
 #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...)                             \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING
 
   if (const Arg *A = Args.getLastArg(OPT_stdlib_EQ))
@@ -3736,7 +3736,7 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts,
 
 #define LANG_OPTION_WITH_MARSHALLING(...)                                      \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef LANG_OPTION_WITH_MARSHALLING
 
   // The '-fcf-protection=' option is generated by CodeGenOpts generator.
@@ -4082,7 +4082,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
 
 #define LANG_OPTION_WITH_MARSHALLING(...)                                      \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef LANG_OPTION_WITH_MARSHALLING
 
   if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) {
@@ -4745,7 +4745,7 @@ static void GeneratePreprocessorArgs(const PreprocessorOptions &Opts,
 
 #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...)                              \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef PREPROCESSOR_OPTION_WITH_MARSHALLING
 
   if (Opts.PCHWithHdrStop && !Opts.PCHWithHdrStopCreate)
@@ -4819,7 +4819,7 @@ static bool ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args,
 
 #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...)                              \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef PREPROCESSOR_OPTION_WITH_MARSHALLING
 
   Opts.PCHWithHdrStop = Args.hasArg(OPT_pch_through_hdrstop_create) ||
@@ -4912,7 +4912,7 @@ GeneratePreprocessorOutputArgs(const PreprocessorOutputOptions &Opts,
 
 #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...)                       \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING
 
   bool Generate_dM = isStrictlyPreprocessorAction(Action) && !Opts.ShowCPP;
@@ -4933,7 +4933,7 @@ static bool ParsePreprocessorOutputArgs(PreprocessorOutputOptions &Opts,
 
 #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...)                       \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING
 
   Opts.ShowCPP = isStrictlyPreprocessorAction(Action) && !Args.hasArg(OPT_dM);
@@ -4948,7 +4948,7 @@ static void GenerateTargetArgs(const TargetOptions &Opts,
   const TargetOptions *TargetOpts = &Opts;
 #define TARGET_OPTION_WITH_MARSHALLING(...)                                    \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef TARGET_OPTION_WITH_MARSHALLING
 
   if (!Opts.SDKVersion.empty())
@@ -4967,7 +4967,7 @@ static bool ParseTargetArgs(TargetOptions &Opts, ArgList &Args,
 
 #define TARGET_OPTION_WITH_MARSHALLING(...)                                    \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef TARGET_OPTION_WITH_MARSHALLING
 
   if (Arg *A = Args.getLastArg(options::OPT_target_sdk_version_EQ)) {
diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
index 99212b81fe064..73b81ed906808 100644
--- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
+++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
@@ -14,11 +14,11 @@
 #include "clang/Driver/Action.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/Utils.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Option/ArgList.h"
@@ -61,11 +61,11 @@ clang::createInvocation(ArrayRef<const char *> ArgList,
   if (!C)
     return nullptr;
 
-  if (C->getArgs().hasArg(driver::options::OPT_fdriver_only))
+  if (C->getArgs().hasArg(options::OPT_fdriver_only))
     return nullptr;
 
   // Just print the cc1 options if -### was present.
-  if (C->getArgs().hasArg(driver::options::OPT__HASH_HASH_HASH)) {
+  if (C->getArgs().hasArg(options::OPT__HASH_HASH_HASH)) {
     C->getJobs().Print(llvm::errs(), "\n", true);
     return nullptr;
   }
diff --git a/clang/lib/FrontendTool/CMakeLists.txt b/clang/lib/FrontendTool/CMakeLists.txt
index 061e54c3e62d0..66213f76eb968 100644
--- a/clang/lib/FrontendTool/CMakeLists.txt
+++ b/clang/lib/FrontendTool/CMakeLists.txt
@@ -7,6 +7,7 @@ set(link_libs
   clangBasic
   clangCodeGen
   clangDriver
+  clangOptions
   clangExtractAPI
   clangFrontend
   clangRewriteFrontend
diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
index c8aad4daa1c10..e571193c6a9c8 100644
--- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
+++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
@@ -13,7 +13,6 @@
 
 #include "clang/CodeGen/CodeGenAction.h"
 #include "clang/Config/config.h"
-#include "clang/Driver/Options.h"
 #include "clang/ExtractAPI/FrontendActions.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
@@ -22,6 +21,7 @@
 #include "clang/Frontend/FrontendPluginRegistry.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/FrontendTool/Utils.h"
+#include "clang/Options/Options.h"
 #include "clang/Rewrite/Frontend/FrontendActions.h"
 #include "clang/StaticAnalyzer/Frontend/AnalyzerHelpFlags.h"
 #include "clang/StaticAnalyzer/Frontend/FrontendActions.h"
@@ -215,11 +215,11 @@ bool ExecuteCompilerInvocation(CompilerInstance *Clang) {
 
   // Honor -help.
   if (Clang->getFrontendOpts().ShowHelp) {
-    driver::getDriverOptTable().printHelp(
+    getDriverOptTable().printHelp(
         llvm::outs(), "clang -cc1 [options] file...",
         "LLVM 'Clang' Compiler: http://clang.llvm.org",
         /*ShowHidden=*/false, /*ShowAllAliases=*/false,
-        llvm::opt::Visibility(driver::options::CC1Option));
+        llvm::opt::Visibility(options::CC1Option));
     return true;
   }
 
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index 76338065d2231..7764fa7dc92b9 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -33,7 +33,6 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Job.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
@@ -43,6 +42,7 @@
 #include "clang/Interpreter/Interpreter.h"
 #include "clang/Interpreter/Value.h"
 #include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Options/Options.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Serialization/ObjectFilePCHContainerReader.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
@@ -185,7 +185,7 @@ IncrementalCompilerBuilder::create(std::string TT,
   llvm::ArrayRef<const char *> RF = llvm::ArrayRef(ClangArgv);
   std::unique_ptr<driver::Compilation> Compilation(Driver.BuildCompilation(RF));
 
-  if (Compilation->getArgs().hasArg(driver::options::OPT_v))
+  if (Compilation->getArgs().hasArg(options::OPT_v))
     Compilation->getJobs().Print(llvm::errs(), "\n", /*Quote=*/false);
 
   auto ErrOrCC1Args = GetCC1Arguments(&Diags, Compilation.get());
diff --git a/clang/lib/Interpreter/InterpreterUtils.h b/clang/lib/Interpreter/InterpreterUtils.h
index fbf9814b0d4a7..4efe8b9fbc6cc 100644
--- a/clang/lib/Interpreter/InterpreterUtils.h
+++ b/clang/lib/Interpreter/InterpreterUtils.h
@@ -21,11 +21,11 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Job.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/TextDiagnosticBuffer.h"
 #include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Options/Options.h"
 
 #include "clang/Sema/Lookup.h"
 #include "llvm/IR/Module.h"
diff --git a/clang/lib/Options/CMakeLists.txt b/clang/lib/Options/CMakeLists.txt
new file mode 100644
index 0000000000000..a762e9918b41c
--- /dev/null
+++ b/clang/lib/Options/CMakeLists.txt
@@ -0,0 +1,18 @@
+set(LLVM_LINK_COMPONENTS
+  Option
+  Support
+)
+
+add_clang_library(clangOptions
+  DriverOptions.cpp
+  OptionUtils.cpp
+  
+  DEPENDS
+  ClangDriverOptions
+  # These generated headers are included transitively.
+  target_parser_gen
+
+  LINK_LIBS
+  clangBasic
+  ${system_libs}
+)
diff --git a/clang/lib/Driver/DriverOptions.cpp b/clang/lib/Options/DriverOptions.cpp
similarity index 76%
rename from clang/lib/Driver/DriverOptions.cpp
rename to clang/lib/Options/DriverOptions.cpp
index cde1f8989935b..d91e9291fb2f6 100644
--- a/clang/lib/Driver/DriverOptions.cpp
+++ b/clang/lib/Options/DriverOptions.cpp
@@ -6,33 +6,32 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/OptTable.h"
 #include <cassert>
 
-using namespace clang::driver;
-using namespace clang::driver::options;
+using namespace clang::options;
 using namespace llvm::opt;
 
 #define OPTTABLE_STR_TABLE_CODE
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef OPTTABLE_STR_TABLE_CODE
 
 #define OPTTABLE_VALUES_CODE
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef OPTTABLE_VALUES_CODE
 
 #define OPTTABLE_PREFIXES_TABLE_CODE
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef OPTTABLE_PREFIXES_TABLE_CODE
 
 #define OPTTABLE_PREFIXES_UNION_CODE
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef OPTTABLE_PREFIXES_UNION_CODE
 
 static constexpr OptTable::Info InfoTable[] = {
 #define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__),
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef OPTION
 };
 
@@ -44,9 +43,9 @@ class DriverOptTable : public PrecomputedOptTable {
       : PrecomputedOptTable(OptionStrTable, OptionPrefixesTable, InfoTable,
                             OptionPrefixesUnion) {}
 };
-}
+} // anonymous namespace
 
-const llvm::opt::OptTable &clang::driver::getDriverOptTable() {
+const llvm::opt::OptTable &clang::getDriverOptTable() {
   static DriverOptTable Table;
   return Table;
 }
diff --git a/clang/lib/Driver/OptionUtils.cpp b/clang/lib/Options/OptionUtils.cpp
similarity index 97%
rename from clang/lib/Driver/OptionUtils.cpp
rename to clang/lib/Options/OptionUtils.cpp
index 1f36ffc03cab3..fcafd3c83c6b3 100644
--- a/clang/lib/Driver/OptionUtils.cpp
+++ b/clang/lib/Options/OptionUtils.cpp
@@ -6,9 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Options/OptionUtils.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticDriver.h"
-#include "clang/Driver/OptionUtils.h"
 #include "llvm/Option/ArgList.h"
 
 using namespace clang;
diff --git a/clang/lib/Tooling/CMakeLists.txt b/clang/lib/Tooling/CMakeLists.txt
index fc1f1f9f9d367..faaa53276d0e6 100644
--- a/clang/lib/Tooling/CMakeLists.txt
+++ b/clang/lib/Tooling/CMakeLists.txt
@@ -40,6 +40,7 @@ add_clang_library(clangTooling
   clangASTMatchers
   clangBasic
   clangDriver
+  clangOptions
   clangFormat
   clangFrontend
   clangLex
diff --git a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
index 28568426a6c48..e9b72388ae4df 100644
--- a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
+++ b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
@@ -44,8 +44,8 @@
 
 #include "clang/Basic/LangStandard.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/Types.h"
+#include "clang/Options/Options.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -164,11 +164,11 @@ struct TransferableCommand {
     // We parse each argument individually so that we can retain the exact
     // spelling of each argument; re-rendering is lossy for aliased flags.
     // E.g. in CL mode, /W4 maps to -Wall.
-    auto &OptTable = clang::driver::getDriverOptTable();
+    auto &OptTable = getDriverOptTable();
     if (!OldArgs.empty())
       Cmd.CommandLine.emplace_back(OldArgs.front());
     for (unsigned Pos = 1; Pos < OldArgs.size();) {
-      using namespace driver::options;
+      using namespace options;
 
       const unsigned OldPos = Pos;
       std::unique_ptr<llvm::opt::Arg> Arg(OptTable.ParseOneArg(
@@ -296,14 +296,14 @@ struct TransferableCommand {
   // Try to interpret the argument as a type specifier, e.g. '-x'.
   std::optional<types::ID> tryParseTypeArg(const llvm::opt::Arg &Arg) {
     const llvm::opt::Option &Opt = Arg.getOption();
-    using namespace driver::options;
+    using namespace options;
     if (ClangCLMode) {
       if (Opt.matches(OPT__SLASH_TC) || Opt.matches(OPT__SLASH_Tc))
         return types::TY_C;
       if (Opt.matches(OPT__SLASH_TP) || Opt.matches(OPT__SLASH_Tp))
         return types::TY_CXX;
     } else {
-      if (Opt.matches(driver::options::OPT_x))
+      if (Opt.matches(options::OPT_x))
         return types::lookupTypeForTypeSpecifier(Arg.getValue());
     }
     return std::nullopt;
@@ -311,7 +311,7 @@ struct TransferableCommand {
 
   // Try to interpret the argument as '-std='.
   std::optional<LangStandard::Kind> tryParseStdArg(const llvm::opt::Arg &Arg) {
-    using namespace driver::options;
+    using namespace options;
     if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ)) {
       // "c++latest" is not a recognized LangStandard, but it's accepted by
       // the clang driver in CL mode.
diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp
index e8eef5ed9c9fa..1f6a5c94601fc 100644
--- a/clang/lib/Tooling/Tooling.cpp
+++ b/clang/lib/Tooling/Tooling.cpp
@@ -21,7 +21,6 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Job.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Frontend/ASTUnit.h"
@@ -32,6 +31,7 @@
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Lex/HeaderSearchOptions.h"
 #include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Options/Options.h"
 #include "clang/Tooling/ArgumentsAdjusters.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -270,17 +270,15 @@ void addTargetAndModeForProgramName(std::vector<std::string> &CommandLine,
                                     StringRef InvokedAs) {
   if (CommandLine.empty() || InvokedAs.empty())
     return;
-  const auto &Table = driver::getDriverOptTable();
+  const auto &Table = getDriverOptTable();
   // --target=X
-  StringRef TargetOPT =
-      Table.getOption(driver::options::OPT_target).getPrefixedName();
+  StringRef TargetOPT = Table.getOption(options::OPT_target).getPrefixedName();
   // -target X
   StringRef TargetOPTLegacy =
-      Table.getOption(driver::options::OPT_target_legacy_spelling)
-          .getPrefixedName();
+      Table.getOption(options::OPT_target_legacy_spelling).getPrefixedName();
   // --driver-mode=X
   StringRef DriverModeOPT =
-      Table.getOption(driver::options::OPT_driver_mode).getPrefixedName();
+      Table.getOption(options::OPT_driver_mode).getPrefixedName();
   auto TargetMode =
       driver::ToolChain::getTargetAndModeFromProgramName(InvokedAs);
   // No need to search for target args if we don't have a target/mode to insert.
diff --git a/clang/tools/clang-check/CMakeLists.txt b/clang/tools/clang-check/CMakeLists.txt
index 5493aa4237aee..1efcc91fcaec9 100644
--- a/clang/tools/clang-check/CMakeLists.txt
+++ b/clang/tools/clang-check/CMakeLists.txt
@@ -14,6 +14,7 @@ clang_target_link_libraries(clang-check
   clangBasic
   clangDriver
   clangFrontend
+  clangOptions
   clangRewriteFrontend
   clangSerialization
   clangStaticAnalyzerFrontend
diff --git a/clang/tools/clang-check/ClangCheck.cpp b/clang/tools/clang-check/ClangCheck.cpp
index fa6dd06a1ee58..80255c647b98f 100644
--- a/clang/tools/clang-check/ClangCheck.cpp
+++ b/clang/tools/clang-check/ClangCheck.cpp
@@ -16,9 +16,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/AST/ASTConsumer.h"
-#include "clang/Driver/Options.h"
 #include "clang/Frontend/ASTConsumers.h"
 #include "clang/Frontend/CompilerInstance.h"
+#include "clang/Options/Options.h"
 #include "clang/Rewrite/Frontend/FixItRewriter.h"
 #include "clang/Rewrite/Frontend/FrontendActions.h"
 #include "clang/StaticAnalyzer/Frontend/FrontendActions.h"
@@ -34,8 +34,8 @@
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/TargetSelect.h"
 
-using namespace clang::driver;
 using namespace clang::tooling;
+using namespace clang;
 using namespace llvm;
 
 static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage);
diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt
index 9c0d9dff7dc7f..54bc80486472f 100644
--- a/clang/tools/clang-installapi/CMakeLists.txt
+++ b/clang/tools/clang-installapi/CMakeLists.txt
@@ -25,6 +25,7 @@ clang_target_link_libraries(clang-installapi
   clangAST
   clangInstallAPI
   clangBasic
+  clangOptions
   clangDriver
   clangFrontend
   clangTooling
diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp
index 4e66485343b89..8bef9690ad855 100644
--- a/clang/tools/clang-installapi/ClangInstallAPI.cpp
+++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp
@@ -35,7 +35,7 @@
 
 using namespace clang;
 using namespace clang::installapi;
-using namespace clang::driver::options;
+using namespace clang::options;
 using namespace llvm::opt;
 using namespace llvm::MachO;
 
@@ -71,7 +71,7 @@ static bool runFrontend(StringRef ProgName, Twine Label, bool Verbose,
 static bool run(ArrayRef<const char *> Args, const char *ProgName) {
   // Setup Diagnostics engine.
   DiagnosticOptions DiagOpts;
-  const llvm::opt::OptTable &ClangOpts = clang::driver::getDriverOptTable();
+  const llvm::opt::OptTable &ClangOpts = getDriverOptTable();
   unsigned MissingArgIndex, MissingArgCount;
   llvm::opt::InputArgList ParsedArgs = ClangOpts.ParseArgs(
       ArrayRef(Args).slice(1), MissingArgIndex, MissingArgCount);
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
index 64324a3f8b010..f484d6f33ad8f 100644
--- a/clang/tools/clang-installapi/Options.cpp
+++ b/clang/tools/clang-installapi/Options.cpp
@@ -26,8 +26,6 @@ using namespace llvm;
 using namespace llvm::opt;
 using namespace llvm::MachO;
 
-namespace drv = clang::driver::options;
-
 namespace clang {
 namespace installapi {
 
@@ -109,7 +107,7 @@ getArgListFromJSON(const StringRef Input, llvm::opt::OptTable *Table,
 
 bool Options::processDriverOptions(InputArgList &Args) {
   // Handle inputs.
-  for (const StringRef Path : Args.getAllArgValues(drv::OPT_INPUT)) {
+  for (const StringRef Path : Args.getAllArgValues(options::OPT_INPUT)) {
     // Assume any input that is not a directory is a filelist.
     // InstallAPI does not accept multiple directories, so retain the last one.
     if (FM->getOptionalDirectoryRef(Path))
@@ -120,7 +118,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
 
   // Handle output.
   SmallString<PATH_MAX> OutputPath;
-  if (auto *Arg = Args.getLastArg(drv::OPT_o)) {
+  if (auto *Arg = Args.getLastArg(options::OPT_o)) {
     OutputPath = Arg->getValue();
     if (OutputPath != "-")
       FM->makeAbsolutePath(OutputPath);
@@ -132,10 +130,10 @@ bool Options::processDriverOptions(InputArgList &Args) {
   }
 
   // Do basic error checking first for mixing -target and -arch options.
-  auto *ArgArch = Args.getLastArgNoClaim(drv::OPT_arch);
-  auto *ArgTarget = Args.getLastArgNoClaim(drv::OPT_target);
+  auto *ArgArch = Args.getLastArgNoClaim(options::OPT_arch);
+  auto *ArgTarget = Args.getLastArgNoClaim(options::OPT_target);
   auto *ArgTargetVariant =
-      Args.getLastArgNoClaim(drv::OPT_darwin_target_variant);
+      Args.getLastArgNoClaim(options::OPT_darwin_target_variant);
   if (ArgArch && (ArgTarget || ArgTargetVariant)) {
     Diags->Report(clang::diag::err_drv_argument_not_allowed_with)
         << ArgArch->getAsString(Args)
@@ -143,7 +141,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     return false;
   }
 
-  auto *ArgMinTargetOS = Args.getLastArgNoClaim(drv::OPT_mtargetos_EQ);
+  auto *ArgMinTargetOS = Args.getLastArgNoClaim(options::OPT_mtargetos_EQ);
   if ((ArgTarget || ArgTargetVariant) && ArgMinTargetOS) {
     Diags->Report(clang::diag::err_drv_cannot_mix_options)
         << ArgTarget->getAsString(Args) << ArgMinTargetOS->getAsString(Args);
@@ -152,7 +150,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
 
   // Capture target triples first.
   if (ArgTarget) {
-    for (const Arg *A : Args.filtered(drv::OPT_target)) {
+    for (const Arg *A : Args.filtered(options::OPT_target)) {
       A->claim();
       llvm::Triple TargetTriple(A->getValue());
       Target TAPITarget = Target(TargetTriple);
@@ -168,7 +166,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
 
   // Capture target variants.
   DriverOpts.Zippered = ArgTargetVariant != nullptr;
-  for (Arg *A : Args.filtered(drv::OPT_darwin_target_variant)) {
+  for (Arg *A : Args.filtered(options::OPT_darwin_target_variant)) {
     A->claim();
     Triple Variant(A->getValue());
     if (Variant.getVendor() != Triple::Apple) {
@@ -213,7 +211,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     DriverOpts.Targets[TAPIVariant] = Variant;
   }
 
-  DriverOpts.Verbose = Args.hasArgNoClaim(drv::OPT_v);
+  DriverOpts.Verbose = Args.hasArgNoClaim(options::OPT_v);
 
   return true;
 }
@@ -407,7 +405,7 @@ bool Options::processOptionList(InputArgList &Args,
 
 bool Options::processLinkerOptions(InputArgList &Args) {
   // Handle required arguments.
-  if (const Arg *A = Args.getLastArg(drv::OPT_install__name))
+  if (const Arg *A = Args.getLastArg(options::OPT_install__name))
     LinkerOpts.InstallName = A->getValue();
   if (LinkerOpts.InstallName.empty()) {
     Diags->Report(diag::err_no_install_name);
@@ -415,28 +413,29 @@ bool Options::processLinkerOptions(InputArgList &Args) {
   }
 
   // Defaulted or optional arguments.
-  if (auto *Arg = Args.getLastArg(drv::OPT_current__version))
+  if (auto *Arg = Args.getLastArg(options::OPT_current__version))
     LinkerOpts.CurrentVersion.parse64(Arg->getValue());
 
-  if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version))
+  if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version))
     LinkerOpts.CompatVersion.parse64(Arg->getValue());
 
-  if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version))
+  if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version))
     LinkerOpts.CompatVersion.parse64(Arg->getValue());
 
-  if (auto *Arg = Args.getLastArg(drv::OPT_umbrella))
+  if (auto *Arg = Args.getLastArg(options::OPT_umbrella))
     LinkerOpts.ParentUmbrella = Arg->getValue();
 
-  LinkerOpts.IsDylib = Args.hasArg(drv::OPT_dynamiclib);
+  LinkerOpts.IsDylib = Args.hasArg(options::OPT_dynamiclib);
 
-  for (auto *Arg : Args.filtered(drv::OPT_alias_list)) {
+  for (auto *Arg : Args.filtered(options::OPT_alias_list)) {
     LinkerOpts.AliasLists.emplace_back(Arg->getValue());
     Arg->claim();
   }
 
-  LinkerOpts.AppExtensionSafe = Args.hasFlag(
-      drv::OPT_fapplication_extension, drv::OPT_fno_application_extension,
-      /*Default=*/LinkerOpts.AppExtensionSafe);
+  LinkerOpts.AppExtensionSafe =
+      Args.hasFlag(options::OPT_fapplication_extension,
+                   options::OPT_fno_application_extension,
+                   /*Default=*/LinkerOpts.AppExtensionSafe);
 
   if (::getenv("LD_NO_ENCRYPT") != nullptr)
     LinkerOpts.AppExtensionSafe = true;
@@ -446,7 +445,7 @@ bool Options::processLinkerOptions(InputArgList &Args) {
 
   // Capture library paths.
   PathSeq LibraryPaths;
-  for (const Arg *A : Args.filtered(drv::OPT_L)) {
+  for (const Arg *A : Args.filtered(options::OPT_L)) {
     LibraryPaths.emplace_back(A->getValue());
     A->claim();
   }
@@ -461,7 +460,7 @@ bool Options::processLinkerOptions(InputArgList &Args) {
 // invocations.
 bool Options::processFrontendOptions(InputArgList &Args) {
   // Capture language mode.
-  if (auto *A = Args.getLastArgNoClaim(drv::OPT_x)) {
+  if (auto *A = Args.getLastArgNoClaim(options::OPT_x)) {
     FEOpts.LangMode = llvm::StringSwitch<clang::Language>(A->getValue())
                           .Case("c", clang::Language::C)
                           .Case("c++", clang::Language::CXX)
@@ -475,15 +474,15 @@ bool Options::processFrontendOptions(InputArgList &Args) {
       return false;
     }
   }
-  for (auto *A : Args.filtered(drv::OPT_ObjC, drv::OPT_ObjCXX)) {
-    if (A->getOption().matches(drv::OPT_ObjC))
+  for (auto *A : Args.filtered(options::OPT_ObjC, options::OPT_ObjCXX)) {
+    if (A->getOption().matches(options::OPT_ObjC))
       FEOpts.LangMode = clang::Language::ObjC;
     else
       FEOpts.LangMode = clang::Language::ObjCXX;
   }
 
   // Capture Sysroot.
-  if (const Arg *A = Args.getLastArgNoClaim(drv::OPT_isysroot)) {
+  if (const Arg *A = Args.getLastArgNoClaim(options::OPT_isysroot)) {
     SmallString<PATH_MAX> Path(A->getValue());
     FM->makeAbsolutePath(Path);
     if (!FM->getOptionalDirectoryRef(Path)) {
@@ -502,13 +501,13 @@ bool Options::processFrontendOptions(InputArgList &Args) {
   }
 
   // Capture system frameworks for all platforms.
-  for (const Arg *A : Args.filtered(drv::OPT_iframework))
+  for (const Arg *A : Args.filtered(options::OPT_iframework))
     FEOpts.SystemFwkPaths.emplace_back(A->getValue(),
                                        std::optional<PlatformType>{});
 
   // Capture framework paths.
   PathSeq FrameworkPaths;
-  for (const Arg *A : Args.filtered(drv::OPT_F))
+  for (const Arg *A : Args.filtered(options::OPT_F))
     FrameworkPaths.emplace_back(A->getValue());
 
   if (!FrameworkPaths.empty())
diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index d9d36f7a41359..002aaef005253 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -63,6 +63,7 @@ clang_target_link_libraries(clang
   clangDriver
   clangFrontend
   clangFrontendTool
+  clangOptions
   clangSerialization
   )
 
diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp
index 52cffa4ccbe1f..2aef75597fc5f 100644
--- a/clang/tools/driver/cc1_main.cpp
+++ b/clang/tools/driver/cc1_main.cpp
@@ -17,7 +17,6 @@
 #include "clang/CodeGen/ObjectFilePCHContainerWriter.h"
 #include "clang/Config/config.h"
 #include "clang/Driver/DriverDiagnostic.h"
-#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
@@ -25,6 +24,7 @@
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/FrontendTool/Utils.h"
+#include "clang/Options/Options.h"
 #include "clang/Serialization/ObjectFilePCHContainerReader.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp
index 50da2f8449a22..f13812f2a8383 100644
--- a/clang/tools/driver/cc1as_main.cpp
+++ b/clang/tools/driver/cc1as_main.cpp
@@ -14,10 +14,10 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Driver/DriverDiagnostic.h"
-#include "clang/Driver/Options.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Frontend/Utils.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -59,8 +59,7 @@
 #include <optional>
 #include <system_error>
 using namespace clang;
-using namespace clang::driver;
-using namespace clang::driver::options;
+using namespace clang::options;
 using namespace llvm;
 using namespace llvm::opt;
 
@@ -688,8 +687,7 @@ int cc1as_main(ArrayRef<const char *> Argv, const char *Argv0, void *MainAddr) {
     getDriverOptTable().printHelp(
         llvm::outs(), "clang -cc1as [options] file...",
         "Clang Integrated Assembler", /*ShowHidden=*/false,
-        /*ShowAllAliases=*/false,
-        llvm::opt::Visibility(driver::options::CC1AsOption));
+        /*ShowAllAliases=*/false, llvm::opt::Visibility(options::CC1AsOption));
 
     return 0;
   }
diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp
index 7390d7d610ec0..1e2c9884ba63d 100644
--- a/clang/tools/driver/driver.cpp
+++ b/clang/tools/driver/driver.cpp
@@ -18,13 +18,13 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/DriverDiagnostic.h"
-#include "clang/Driver/Options.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Frontend/ChainedDiagnosticConsumer.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/SerializedDiagnosticPrinter.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Frontend/Utils.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp
index 62274235c53f5..e0454f190b35a 100644
--- a/clang/unittests/Driver/DXCModeTest.cpp
+++ b/clang/unittests/Driver/DXCModeTest.cpp
@@ -131,8 +131,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
       TC.TranslateArgs(*DAL, "0", Action::OffloadKind::OFK_None)};
   EXPECT_NE(TranslatedArgs, nullptr);
   if (TranslatedArgs) {
-    auto *A = TranslatedArgs->getLastArg(
-        clang::driver::options::OPT_dxil_validator_version);
+    auto *A =
+        TranslatedArgs->getLastArg(clang::options::OPT_dxil_validator_version);
     EXPECT_NE(A, nullptr);
     if (A) {
       EXPECT_STREQ(A->getValue(), "1.1");
diff --git a/clang/www/OpenProjects.html b/clang/www/OpenProjects.html
index ae0ec1d4d12cb..3e5e84b5b2ed4 100755
--- a/clang/www/OpenProjects.html
+++ b/clang/www/OpenProjects.html
@@ -38,7 +38,7 @@ <h1>Open Clang Projects</h1>
   <li>documenting <a href="https://github.com/llvm/llvm-project/blob/main/clang/include/clang/Basic/DiagnosticDocs.td">
       diagnostic group flags</a> (adding code examples of what is diagnosed, or
       other relevant information), or</li>
-  <li>documenting <a href="https://github.com/llvm/llvm-project/blob/main/clang/include/clang/Driver/Options.td">
+  <li>documenting <a href="https://github.com/llvm/llvm-project/blob/main/clang/include/clang/Options/Options.td">
       command line options</a>, or</li>
   <li>help with completing other missing documentation.</li>
 </ul>
diff --git a/flang/docs/CMakeLists.txt b/flang/docs/CMakeLists.txt
index 568f942cb4aa6..b183d6add1059 100644
--- a/flang/docs/CMakeLists.txt
+++ b/flang/docs/CMakeLists.txt
@@ -88,7 +88,7 @@ function (gen_rst_file_from_td output_file td_option source target)
   endif()
   get_filename_component(TABLEGEN_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${source}" DIRECTORY)
   list(APPEND LLVM_TABLEGEN_FLAGS "-I${TABLEGEN_INCLUDE_DIR}")
-  list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Driver/")
+  list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Options/")
   clang_tablegen(Source/${output_file} ${td_option} SOURCE ${source} TARGET ${target})
 endfunction()
 
diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md
index 3286171bb1499..9953f2252218b 100644
--- a/flang/docs/FlangDriver.md
+++ b/flang/docs/FlangDriver.md
@@ -76,7 +76,7 @@ will ignore it when used without `Xflang`.
 As hinted above, `flang` and `flang -fc1` are two separate tools. The
 fact that these tools are accessed through one binary, `flang`, is just an
 implementation detail. Each tool has a separate list of options, albeit defined
-in the same file: `clang/include/clang/Driver/Options.td`.
+in the same file: `clang/include/clang/Options/Options.td`.
 
 The separation helps us split various tasks and allows us to implement more
 specialised tools. In particular, `flang` is not aware of various
@@ -112,7 +112,7 @@ in terms of Clang's driver library, `clangDriver`. This approach allows us to:
   as linkers and assemblers.
 One implication of this dependency on Clang is that all of Flang's compiler
 options are defined alongside Clang's options in
-`clang/include/clang/Driver/Options.td`. For options that are common for both
+`clang/include/clang/Options/Options.td`. For options that are common for both
 Flang and Clang, the corresponding definitions are shared.
 
 Internally, a `clangDriver` based compiler driver works by creating actions
@@ -242,7 +242,7 @@ Adding a new compiler option in Flang consists of two steps:
 
 ### Option Definition
 All of Flang's compiler and frontend driver options are defined in
-`clang/include/clang/Driver/Options.td` in Clang. When adding a new option to
+`clang/include/clang/Options/Options.td` in Clang. When adding a new option to
 Flang, you will either:
   * extend the existing definition for an option that is already available
     in one of Clang's drivers (e.g.  `clang`), but not yet available in Flang, or
@@ -314,7 +314,7 @@ add, you will have to add a dedicated entry in that enum (e.g.
 `ParseSyntaxOnly` for `-fsyntax-only`) and a corresponding `case` in
 `ParseFrontendArgs` function in the `CompilerInvocation.cpp` file, e.g.:
 ```cpp
-    case clang::driver::options::OPT_fsyntax_only:
+    case clang::options::OPT_fsyntax_only:
       opts.programAction = ParseSyntaxOnly;
       break;
 ```
diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt
index 2b3bc0e9c2269..bb0b4a39cec9b 100644
--- a/flang/lib/Frontend/CMakeLists.txt
+++ b/flang/lib/Frontend/CMakeLists.txt
@@ -76,6 +76,7 @@ add_flang_library(flangFrontend
   CLANG_LIBS
   clangBasic
   clangDriver
+  clangOptions
 )
 
 target_precompile_headers(flangFrontend PRIVATE
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index f05c4cfccf7fc..a9bb78d690f46 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -26,8 +26,8 @@
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Driver/OptionUtils.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/OptionUtils.h"
+#include "clang/Options/Options.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -82,11 +82,11 @@ static bool parseShowColorsArgs(const llvm::opt::ArgList &args,
 
   for (auto *a : args) {
     const llvm::opt::Option &opt = a->getOption();
-    if (opt.matches(clang::driver::options::OPT_fcolor_diagnostics)) {
+    if (opt.matches(clang::options::OPT_fcolor_diagnostics)) {
       showColors = Colors_On;
-    } else if (opt.matches(clang::driver::options::OPT_fno_color_diagnostics)) {
+    } else if (opt.matches(clang::options::OPT_fno_color_diagnostics)) {
       showColors = Colors_Off;
-    } else if (opt.matches(clang::driver::options::OPT_fdiagnostics_color_EQ)) {
+    } else if (opt.matches(clang::options::OPT_fdiagnostics_color_EQ)) {
       llvm::StringRef value(a->getValue());
       if (value == "always")
         showColors = Colors_On;
@@ -107,15 +107,13 @@ static unsigned getOptimizationLevel(llvm::opt::ArgList &args,
                                      clang::DiagnosticsEngine &diags) {
   unsigned defaultOpt = 0;
 
-  if (llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_O_Group)) {
-    if (a->getOption().matches(clang::driver::options::OPT_O0))
+  if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_O_Group)) {
+    if (a->getOption().matches(clang::options::OPT_O0))
       return 0;
 
-    assert(a->getOption().matches(clang::driver::options::OPT_O));
+    assert(a->getOption().matches(clang::options::OPT_O));
 
-    return getLastArgIntValue(args, clang::driver::options::OPT_O, defaultOpt,
-                              diags);
+    return getLastArgIntValue(args, clang::options::OPT_O, defaultOpt, diags);
   }
 
   return defaultOpt;
@@ -133,7 +131,7 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts,
                            clang::DiagnosticsEngine &diags) {
   using DebugInfoKind = llvm::codegenoptions::DebugInfoKind;
   if (llvm::opt::Arg *arg =
-          args.getLastArg(clang::driver::options::OPT_debug_info_kind_EQ)) {
+          args.getLastArg(clang::options::OPT_debug_info_kind_EQ)) {
     std::optional<DebugInfoKind> val =
         llvm::StringSwitch<std::optional<DebugInfoKind>>(arg->getValue())
             .Case("line-tables-only", llvm::codegenoptions::DebugLineTablesOnly)
@@ -158,13 +156,13 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts,
       diags.Report(debugWarning) << arg->getValue();
     }
     opts.DwarfVersion =
-        getLastArgIntValue(args, clang::driver::options::OPT_dwarf_version_EQ,
+        getLastArgIntValue(args, clang::options::OPT_dwarf_version_EQ,
                            /*Default=*/0, diags);
     if (const llvm::opt::Arg *a =
-            args.getLastArg(clang::driver::options::OPT_split_dwarf_file))
+            args.getLastArg(clang::options::OPT_split_dwarf_file))
       opts.SplitDwarfFile = a->getValue();
     if (const llvm::opt::Arg *a =
-            args.getLastArg(clang::driver::options::OPT_split_dwarf_output))
+            args.getLastArg(clang::options::OPT_split_dwarf_output))
       opts.SplitDwarfOutput = a->getValue();
   }
   return true;
@@ -174,7 +172,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts,
                                      llvm::opt::ArgList &args,
                                      clang::DiagnosticsEngine &diags) {
   llvm::opt::Arg *arg =
-      args.getLastArg(clang::driver::options::OPT_fdo_concurrent_to_openmp_EQ);
+      args.getLastArg(clang::options::OPT_fdo_concurrent_to_openmp_EQ);
   if (!arg)
     return;
 
@@ -199,7 +197,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts,
 static bool parseVectorLibArg(Fortran::frontend::CodeGenOptions &opts,
                               llvm::opt::ArgList &args,
                               clang::DiagnosticsEngine &diags) {
-  llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fveclib);
+  llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fveclib);
   if (!arg)
     return true;
 
@@ -237,7 +235,7 @@ parseOptimizationRemark(clang::DiagnosticsEngine &diags,
   CodeGenOptions::OptRemark result;
 
   for (llvm::opt::Arg *a : args) {
-    if (a->getOption().matches(clang::driver::options::OPT_R_Joined)) {
+    if (a->getOption().matches(clang::options::OPT_R_Joined)) {
       llvm::StringRef value = a->getValue();
 
       if (value == remarkOptName) {
@@ -274,39 +272,39 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
                              clang::DiagnosticsEngine &diags) {
   opts.OptimizationLevel = getOptimizationLevel(args, diags);
 
-  if (args.hasFlag(clang::driver::options::OPT_fdebug_pass_manager,
-                   clang::driver::options::OPT_fno_debug_pass_manager, false))
+  if (args.hasFlag(clang::options::OPT_fdebug_pass_manager,
+                   clang::options::OPT_fno_debug_pass_manager, false))
     opts.DebugPassManager = 1;
 
-  if (args.hasFlag(clang::driver::options::OPT_fstack_arrays,
-                   clang::driver::options::OPT_fno_stack_arrays, false))
+  if (args.hasFlag(clang::options::OPT_fstack_arrays,
+                   clang::options::OPT_fno_stack_arrays, false))
     opts.StackArrays = 1;
 
-  if (args.getLastArg(clang::driver::options::OPT_floop_interchange))
+  if (args.getLastArg(clang::options::OPT_floop_interchange))
     opts.InterchangeLoops = 1;
 
-  if (args.getLastArg(clang::driver::options::OPT_fexperimental_loop_fusion))
+  if (args.getLastArg(clang::options::OPT_fexperimental_loop_fusion))
     opts.FuseLoops = 1;
 
-  if (args.getLastArg(clang::driver::options::OPT_vectorize_loops))
+  if (args.getLastArg(clang::options::OPT_vectorize_loops))
     opts.VectorizeLoop = 1;
 
-  if (args.getLastArg(clang::driver::options::OPT_vectorize_slp))
+  if (args.getLastArg(clang::options::OPT_vectorize_slp))
     opts.VectorizeSLP = 1;
 
-  if (args.hasFlag(clang::driver::options::OPT_floop_versioning,
-                   clang::driver::options::OPT_fno_loop_versioning, false))
+  if (args.hasFlag(clang::options::OPT_floop_versioning,
+                   clang::options::OPT_fno_loop_versioning, false))
     opts.LoopVersioning = 1;
 
-  opts.UnrollLoops = args.hasFlag(clang::driver::options::OPT_funroll_loops,
-                                  clang::driver::options::OPT_fno_unroll_loops,
+  opts.UnrollLoops = args.hasFlag(clang::options::OPT_funroll_loops,
+                                  clang::options::OPT_fno_unroll_loops,
                                   (opts.OptimizationLevel > 1));
 
   opts.AliasAnalysis = opts.OptimizationLevel > 0;
 
   // -mframe-pointer=none/non-leaf/reserved/all option.
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_mframe_pointer_EQ)) {
+          args.getLastArg(clang::options::OPT_mframe_pointer_EQ)) {
     std::optional<llvm::FramePointerKind> val =
         llvm::StringSwitch<std::optional<llvm::FramePointerKind>>(a->getValue())
             .Case("none", llvm::FramePointerKind::None)
@@ -322,7 +320,7 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
       opts.setFramePointer(val.value());
   }
 
-  for (auto *a : args.filtered(clang::driver::options::OPT_fpass_plugin_EQ))
+  for (auto *a : args.filtered(clang::options::OPT_fpass_plugin_EQ))
     opts.LLVMPassPlugins.push_back(a->getValue());
 
   opts.Reciprocals = clang::driver::tools::parseMRecipOption(diags, args);
@@ -331,15 +329,14 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
       clang::driver::tools::parseMPreferVectorWidthOption(diags, args);
 
   // -fembed-offload-object option
-  for (auto *a :
-       args.filtered(clang::driver::options::OPT_fembed_offload_object_EQ))
+  for (auto *a : args.filtered(clang::options::OPT_fembed_offload_object_EQ))
     opts.OffloadObjects.push_back(a->getValue());
 
-  if (args.hasArg(clang::driver::options::OPT_finstrument_functions))
+  if (args.hasArg(clang::options::OPT_finstrument_functions))
     opts.InstrumentFunctions = 1;
 
-  if (const llvm::opt::Arg *a = args.getLastArg(
-          clang::driver::options::OPT_mcode_object_version_EQ)) {
+  if (const llvm::opt::Arg *a =
+          args.getLastArg(clang::options::OPT_mcode_object_version_EQ)) {
     llvm::StringRef s = a->getValue();
     if (s == "6")
       opts.CodeObjectVersion = llvm::CodeObjectVersionKind::COV_6;
@@ -353,36 +350,36 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
 
   // -f[no-]save-optimization-record[=<format>]
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_opt_record_file))
+          args.getLastArg(clang::options::OPT_opt_record_file))
     opts.OptRecordFile = a->getValue();
 
   // Optimization file format. Defaults to yaml
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_opt_record_format))
+          args.getLastArg(clang::options::OPT_opt_record_format))
     opts.OptRecordFormat = a->getValue();
 
   // Specifies, using a regex, which successful optimization passes(middle and
   // backend), to include in the final optimization record file generated. If
   // not provided -fsave-optimization-record will include all passes.
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_opt_record_passes))
+          args.getLastArg(clang::options::OPT_opt_record_passes))
     opts.OptRecordPasses = a->getValue();
 
   // Create OptRemark that allows printing of all successful optimization
   // passes applied.
   opts.OptimizationRemark =
-      parseOptimizationRemark(diags, args, clang::driver::options::OPT_Rpass_EQ,
+      parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_EQ,
                               /*remarkOptName=*/"pass");
 
   // Create OptRemark that allows all missed optimization passes to be printed.
-  opts.OptimizationRemarkMissed = parseOptimizationRemark(
-      diags, args, clang::driver::options::OPT_Rpass_missed_EQ,
-      /*remarkOptName=*/"pass-missed");
+  opts.OptimizationRemarkMissed =
+      parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_missed_EQ,
+                              /*remarkOptName=*/"pass-missed");
 
   // Create OptRemark that allows all optimization decisions made by LLVM
   // to be printed.
   opts.OptimizationRemarkAnalysis = parseOptimizationRemark(
-      diags, args, clang::driver::options::OPT_Rpass_analysis_EQ,
+      diags, args, clang::options::OPT_Rpass_analysis_EQ,
       /*remarkOptName=*/"pass-analysis");
 
   if (opts.getDebugInfo() == llvm::codegenoptions::NoDebugInfo) {
@@ -400,23 +397,22 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
       opts.setDebugInfo(llvm::codegenoptions::LocTrackingOnly);
   }
 
-  if (auto *a = args.getLastArg(clang::driver::options::OPT_save_temps_EQ))
+  if (auto *a = args.getLastArg(clang::options::OPT_save_temps_EQ))
     opts.SaveTempsDir = a->getValue();
 
   // -record-command-line option.
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_record_command_line)) {
+          args.getLastArg(clang::options::OPT_record_command_line)) {
     opts.RecordCommandLine = a->getValue();
   }
 
   // -mlink-builtin-bitcode
-  for (auto *a :
-       args.filtered(clang::driver::options::OPT_mlink_builtin_bitcode))
+  for (auto *a : args.filtered(clang::options::OPT_mlink_builtin_bitcode))
     opts.BuiltinBCLibs.push_back(a->getValue());
 
   // -mrelocation-model option.
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_mrelocation_model)) {
+          args.getLastArg(clang::options::OPT_mrelocation_model)) {
     llvm::StringRef modelName = a->getValue();
     auto relocModel =
         llvm::StringSwitch<std::optional<llvm::Reloc::Model>>(modelName)
@@ -435,31 +431,30 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
   }
 
   // -pic-level and -pic-is-pie option.
-  if (int picLevel = getLastArgIntValue(
-          args, clang::driver::options::OPT_pic_level, 0, diags)) {
+  if (int picLevel =
+          getLastArgIntValue(args, clang::options::OPT_pic_level, 0, diags)) {
     if (picLevel > 2)
       diags.Report(clang::diag::err_drv_invalid_value)
-          << args.getLastArg(clang::driver::options::OPT_pic_level)
-                 ->getAsString(args)
+          << args.getLastArg(clang::options::OPT_pic_level)->getAsString(args)
           << picLevel;
 
     opts.PICLevel = picLevel;
-    if (args.hasArg(clang::driver::options::OPT_pic_is_pie))
+    if (args.hasArg(clang::options::OPT_pic_is_pie))
       opts.IsPIE = 1;
   }
 
-  if (args.hasArg(clang::driver::options::OPT_fprofile_generate)) {
+  if (args.hasArg(clang::options::OPT_fprofile_generate)) {
     opts.setProfileInstr(llvm::driver::ProfileInstrKind::ProfileIRInstr);
   }
 
-  if (auto A = args.getLastArg(clang::driver::options::OPT_fprofile_use_EQ)) {
+  if (auto A = args.getLastArg(clang::options::OPT_fprofile_use_EQ)) {
     opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr);
     opts.ProfileInstrumentUsePath = A->getValue();
   }
 
   // -mcmodel option.
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_mcmodel_EQ)) {
+          args.getLastArg(clang::options::OPT_mcmodel_EQ)) {
     llvm::StringRef modelName = a->getValue();
     std::optional<llvm::CodeModel::Model> codeModel = getCodeModel(modelName);
 
@@ -470,8 +465,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
           << a->getAsString(args) << modelName;
   }
 
-  if (const llvm::opt::Arg *arg = args.getLastArg(
-          clang::driver::options::OPT_mlarge_data_threshold_EQ)) {
+  if (const llvm::opt::Arg *arg =
+          args.getLastArg(clang::options::OPT_mlarge_data_threshold_EQ)) {
     uint64_t LDT;
     if (llvm::StringRef(arg->getValue()).getAsInteger(/*Radix=*/10, LDT)) {
       diags.Report(clang::diag::err_drv_invalid_value)
@@ -481,15 +476,15 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
   }
 
   // This option is compatible with -f[no-]underscoring in gfortran.
-  if (args.hasFlag(clang::driver::options::OPT_fno_underscoring,
-                   clang::driver::options::OPT_funderscoring, false)) {
+  if (args.hasFlag(clang::options::OPT_fno_underscoring,
+                   clang::options::OPT_funderscoring, false)) {
     opts.Underscoring = 0;
   }
 
   parseDoConcurrentMapping(opts, args, diags);
 
   if (const llvm::opt::Arg *arg =
-          args.getLastArg(clang::driver::options::OPT_complex_range_EQ)) {
+          args.getLastArg(clang::options::OPT_complex_range_EQ)) {
     llvm::StringRef argValue = llvm::StringRef(arg->getValue());
     if (argValue == "full") {
       opts.setComplexRange(CodeGenOptions::ComplexRangeKind::CX_Full);
@@ -510,46 +505,42 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
 /// \param [in] opts The target options instance to update
 /// \param [in] args The list of input arguments (from the compiler invocation)
 static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) {
-  if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_triple))
+  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_triple))
     opts.triple = a->getValue();
 
-  opts.atomicIgnoreDenormalMode = args.hasFlag(
-      clang::driver::options::OPT_fatomic_ignore_denormal_mode,
-      clang::driver::options::OPT_fno_atomic_ignore_denormal_mode, false);
-  opts.atomicFineGrainedMemory = args.hasFlag(
-      clang::driver::options::OPT_fatomic_fine_grained_memory,
-      clang::driver::options::OPT_fno_atomic_fine_grained_memory, false);
+  opts.atomicIgnoreDenormalMode =
+      args.hasFlag(clang::options::OPT_fatomic_ignore_denormal_mode,
+                   clang::options::OPT_fno_atomic_ignore_denormal_mode, false);
+  opts.atomicFineGrainedMemory =
+      args.hasFlag(clang::options::OPT_fatomic_fine_grained_memory,
+                   clang::options::OPT_fno_atomic_fine_grained_memory, false);
   opts.atomicRemoteMemory =
-      args.hasFlag(clang::driver::options::OPT_fatomic_remote_memory,
-                   clang::driver::options::OPT_fno_atomic_remote_memory, false);
+      args.hasFlag(clang::options::OPT_fatomic_remote_memory,
+                   clang::options::OPT_fno_atomic_remote_memory, false);
 
-  if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_target_cpu))
+  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_target_cpu))
     opts.cpu = a->getValue();
 
-  if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_tune_cpu))
+  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_tune_cpu))
     opts.cpuToTuneFor = a->getValue();
 
   for (const llvm::opt::Arg *currentArg :
-       args.filtered(clang::driver::options::OPT_target_feature))
+       args.filtered(clang::options::OPT_target_feature))
     opts.featuresAsWritten.emplace_back(currentArg->getValue());
 
-  if (args.hasArg(clang::driver::options::OPT_fdisable_real_10))
+  if (args.hasArg(clang::options::OPT_fdisable_real_10))
     opts.disabledRealKinds.push_back(10);
 
-  if (args.hasArg(clang::driver::options::OPT_fdisable_real_3))
+  if (args.hasArg(clang::options::OPT_fdisable_real_3))
     opts.disabledRealKinds.push_back(3);
 
-  if (args.hasArg(clang::driver::options::OPT_fdisable_integer_2))
+  if (args.hasArg(clang::options::OPT_fdisable_integer_2))
     opts.disabledIntegerKinds.push_back(2);
 
-  if (args.hasArg(clang::driver::options::OPT_fdisable_integer_16))
+  if (args.hasArg(clang::options::OPT_fdisable_integer_16))
     opts.disabledIntegerKinds.push_back(16);
 
-  if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_mabi_EQ)) {
+  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_mabi_EQ)) {
     opts.abi = a->getValue();
     llvm::StringRef V = a->getValue();
     if (V == "vec-extabi") {
@@ -559,9 +550,8 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) {
     }
   }
 
-  opts.asmVerbose =
-      args.hasFlag(clang::driver::options::OPT_fverbose_asm,
-                   clang::driver::options::OPT_fno_verbose_asm, false);
+  opts.asmVerbose = args.hasFlag(clang::options::OPT_fverbose_asm,
+                                 clang::options::OPT_fno_verbose_asm, false);
 }
 // Tweak the frontend configuration based on the frontend action
 static void setUpFrontendBasedOnAction(FrontendOptions &opts) {
@@ -594,11 +584,11 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
   // Treat multiple action options as an invocation error. Note that `clang
   // -cc1` does accept multiple action options, but will only consider the
   // rightmost one.
-  if (args.hasMultipleArgs(clang::driver::options::OPT_Action_Group)) {
+  if (args.hasMultipleArgs(clang::options::OPT_Action_Group)) {
     llvm::SmallString<32> buf;
     llvm::raw_svector_ostream os(buf);
     for (const llvm::opt::Arg *arg :
-         args.filtered(clang::driver::options::OPT_Action_Group)) {
+         args.filtered(clang::options::OPT_Action_Group)) {
       if (buf.size())
         os << ", ";
       os << "'" << arg->getSpelling() << "'";
@@ -609,99 +599,99 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
 
   // Identify the action (i.e. opts.ProgramAction)
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_Action_Group)) {
+          args.getLastArg(clang::options::OPT_Action_Group)) {
     switch (a->getOption().getID()) {
     default: {
       llvm_unreachable("Invalid option in group!");
     }
-    case clang::driver::options::OPT_test_io:
+    case clang::options::OPT_test_io:
       opts.programAction = InputOutputTest;
       break;
-    case clang::driver::options::OPT_E:
+    case clang::options::OPT_E:
       opts.programAction = PrintPreprocessedInput;
       break;
-    case clang::driver::options::OPT_fsyntax_only:
+    case clang::options::OPT_fsyntax_only:
       opts.programAction = ParseSyntaxOnly;
       break;
-    case clang::driver::options::OPT_emit_fir:
+    case clang::options::OPT_emit_fir:
       opts.programAction = EmitFIR;
       break;
-    case clang::driver::options::OPT_emit_hlfir:
+    case clang::options::OPT_emit_hlfir:
       opts.programAction = EmitHLFIR;
       break;
-    case clang::driver::options::OPT_emit_llvm:
+    case clang::options::OPT_emit_llvm:
       opts.programAction = EmitLLVM;
       break;
-    case clang::driver::options::OPT_emit_llvm_bc:
+    case clang::options::OPT_emit_llvm_bc:
       opts.programAction = EmitLLVMBitcode;
       break;
-    case clang::driver::options::OPT_emit_obj:
+    case clang::options::OPT_emit_obj:
       opts.programAction = EmitObj;
       break;
-    case clang::driver::options::OPT_S:
+    case clang::options::OPT_S:
       opts.programAction = EmitAssembly;
       break;
-    case clang::driver::options::OPT_fdebug_unparse:
+    case clang::options::OPT_fdebug_unparse:
       opts.programAction = DebugUnparse;
       break;
-    case clang::driver::options::OPT_fdebug_unparse_no_sema:
+    case clang::options::OPT_fdebug_unparse_no_sema:
       opts.programAction = DebugUnparseNoSema;
       break;
-    case clang::driver::options::OPT_fdebug_unparse_with_symbols:
+    case clang::options::OPT_fdebug_unparse_with_symbols:
       opts.programAction = DebugUnparseWithSymbols;
       break;
-    case clang::driver::options::OPT_fdebug_unparse_with_modules:
+    case clang::options::OPT_fdebug_unparse_with_modules:
       opts.programAction = DebugUnparseWithModules;
       break;
-    case clang::driver::options::OPT_fdebug_dump_symbols:
+    case clang::options::OPT_fdebug_dump_symbols:
       opts.programAction = DebugDumpSymbols;
       break;
-    case clang::driver::options::OPT_fdebug_dump_parse_tree:
+    case clang::options::OPT_fdebug_dump_parse_tree:
       opts.programAction = DebugDumpParseTree;
       break;
-    case clang::driver::options::OPT_fdebug_dump_pft:
+    case clang::options::OPT_fdebug_dump_pft:
       opts.programAction = DebugDumpPFT;
       break;
-    case clang::driver::options::OPT_fdebug_dump_all:
+    case clang::options::OPT_fdebug_dump_all:
       opts.programAction = DebugDumpAll;
       break;
-    case clang::driver::options::OPT_fdebug_dump_parse_tree_no_sema:
+    case clang::options::OPT_fdebug_dump_parse_tree_no_sema:
       opts.programAction = DebugDumpParseTreeNoSema;
       break;
-    case clang::driver::options::OPT_fdebug_dump_provenance:
+    case clang::options::OPT_fdebug_dump_provenance:
       opts.programAction = DebugDumpProvenance;
       break;
-    case clang::driver::options::OPT_fdebug_dump_parsing_log:
+    case clang::options::OPT_fdebug_dump_parsing_log:
       opts.programAction = DebugDumpParsingLog;
       break;
-    case clang::driver::options::OPT_fdebug_measure_parse_tree:
+    case clang::options::OPT_fdebug_measure_parse_tree:
       opts.programAction = DebugMeasureParseTree;
       break;
-    case clang::driver::options::OPT_fdebug_pre_fir_tree:
+    case clang::options::OPT_fdebug_pre_fir_tree:
       opts.programAction = DebugPreFIRTree;
       break;
-    case clang::driver::options::OPT_fget_symbols_sources:
+    case clang::options::OPT_fget_symbols_sources:
       opts.programAction = GetSymbolsSources;
       break;
-    case clang::driver::options::OPT_fget_definition:
+    case clang::options::OPT_fget_definition:
       opts.programAction = GetDefinition;
       break;
-    case clang::driver::options::OPT_init_only:
+    case clang::options::OPT_init_only:
       opts.programAction = InitOnly;
       break;
 
       // TODO:
-      // case clang::driver::options::OPT_emit_llvm:
-      // case clang::driver::options::OPT_emit_llvm_only:
-      // case clang::driver::options::OPT_emit_codegen_only:
-      // case clang::driver::options::OPT_emit_module:
+      // case clang::options::OPT_emit_llvm:
+      // case clang::options::OPT_emit_llvm_only:
+      // case clang::options::OPT_emit_codegen_only:
+      // case clang::options::OPT_emit_module:
       // (...)
     }
 
     // Parse the values provided with `-fget-definition` (there should be 3
     // integers)
     if (llvm::opt::OptSpecifier(a->getOption().getID()) ==
-        clang::driver::options::OPT_fget_definition) {
+        clang::options::OPT_fget_definition) {
       unsigned optVals[3] = {0, 0, 0};
 
       for (unsigned i = 0; i < 3; i++) {
@@ -721,27 +711,25 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
   }
 
   // Parsing -load <dsopath> option and storing shared object path
-  if (llvm::opt::Arg *a = args.getLastArg(clang::driver::options::OPT_load)) {
+  if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_load)) {
     opts.plugins.push_back(a->getValue());
   }
 
   // Parsing -plugin <name> option and storing plugin name and setting action
-  if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_plugin)) {
+  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_plugin)) {
     opts.programAction = PluginAction;
     opts.actionName = a->getValue();
   }
 
-  opts.outputFile = args.getLastArgValue(clang::driver::options::OPT_o);
-  opts.showHelp = args.hasArg(clang::driver::options::OPT_help);
-  opts.showVersion = args.hasArg(clang::driver::options::OPT_version);
+  opts.outputFile = args.getLastArgValue(clang::options::OPT_o);
+  opts.showHelp = args.hasArg(clang::options::OPT_help);
+  opts.showVersion = args.hasArg(clang::options::OPT_version);
   opts.printSupportedCPUs =
-      args.hasArg(clang::driver::options::OPT_print_supported_cpus);
+      args.hasArg(clang::options::OPT_print_supported_cpus);
 
   // Get the input kind (from the value passed via `-x`)
   InputKind dashX(Language::Unknown);
-  if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_x)) {
+  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_x)) {
     llvm::StringRef xValue = a->getValue();
     // Principal languages.
     dashX = llvm::StringSwitch<InputKind>(xValue)
@@ -768,7 +756,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
 
   // Collect the input files and save them in our instance of FrontendOptions.
   std::vector<std::string> inputs =
-      args.getAllArgValues(clang::driver::options::OPT_INPUT);
+      args.getAllArgValues(clang::options::OPT_INPUT);
   opts.inputs.clear();
   if (inputs.empty())
     // '-' is the default input if none is given.
@@ -788,18 +776,16 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
   }
 
   // Set fortranForm based on options -ffree-form and -ffixed-form.
-  if (const auto *arg =
-          args.getLastArg(clang::driver::options::OPT_ffixed_form,
-                          clang::driver::options::OPT_ffree_form)) {
-    opts.fortranForm =
-        arg->getOption().matches(clang::driver::options::OPT_ffixed_form)
-            ? FortranForm::FixedForm
-            : FortranForm::FreeForm;
+  if (const auto *arg = args.getLastArg(clang::options::OPT_ffixed_form,
+                                        clang::options::OPT_ffree_form)) {
+    opts.fortranForm = arg->getOption().matches(clang::options::OPT_ffixed_form)
+                           ? FortranForm::FixedForm
+                           : FortranForm::FreeForm;
   }
 
   // Set fixedFormColumns based on -ffixed-line-length=<value>
   if (const auto *arg =
-          args.getLastArg(clang::driver::options::OPT_ffixed_line_length_EQ)) {
+          args.getLastArg(clang::options::OPT_ffixed_line_length_EQ)) {
     llvm::StringRef argValue = llvm::StringRef(arg->getValue());
     std::int64_t columns = -1;
     if (argValue == "none") {
@@ -821,8 +807,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
   }
 
   // Set conversion based on -fconvert=<value>
-  if (const auto *arg =
-          args.getLastArg(clang::driver::options::OPT_fconvert_EQ)) {
+  if (const auto *arg = args.getLastArg(clang::options::OPT_fconvert_EQ)) {
     const char *argValue = arg->getValue();
     if (auto convert = parseConvertArg(argValue))
       opts.envDefaults.push_back({"FORT_CONVERT", *convert});
@@ -832,59 +817,55 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
   }
 
   // -f{no-}implicit-none
-  opts.features.Enable(
-      Fortran::common::LanguageFeature::ImplicitNoneTypeAlways,
-      args.hasFlag(clang::driver::options::OPT_fimplicit_none,
-                   clang::driver::options::OPT_fno_implicit_none, false));
+  opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneTypeAlways,
+                       args.hasFlag(clang::options::OPT_fimplicit_none,
+                                    clang::options::OPT_fno_implicit_none,
+                                    false));
 
   // -f{no-}implicit-none-ext
-  opts.features.Enable(
-      Fortran::common::LanguageFeature::ImplicitNoneExternal,
-      args.hasFlag(clang::driver::options::OPT_fimplicit_none_ext,
-                   clang::driver::options::OPT_fno_implicit_none_ext, false));
+  opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneExternal,
+                       args.hasFlag(clang::options::OPT_fimplicit_none_ext,
+                                    clang::options::OPT_fno_implicit_none_ext,
+                                    false));
 
   // -f{no-}backslash
   opts.features.Enable(Fortran::common::LanguageFeature::BackslashEscapes,
-                       args.hasFlag(clang::driver::options::OPT_fbackslash,
-                                    clang::driver::options::OPT_fno_backslash,
-                                    false));
+                       args.hasFlag(clang::options::OPT_fbackslash,
+                                    clang::options::OPT_fno_backslash, false));
 
   // -f{no-}logical-abbreviations
   opts.features.Enable(
       Fortran::common::LanguageFeature::LogicalAbbreviations,
-      args.hasFlag(clang::driver::options::OPT_flogical_abbreviations,
-                   clang::driver::options::OPT_fno_logical_abbreviations,
-                   false));
+      args.hasFlag(clang::options::OPT_flogical_abbreviations,
+                   clang::options::OPT_fno_logical_abbreviations, false));
 
   // -f{no-}unsigned
   opts.features.Enable(Fortran::common::LanguageFeature::Unsigned,
-                       args.hasFlag(clang::driver::options::OPT_funsigned,
-                                    clang::driver::options::OPT_fno_unsigned,
-                                    false));
+                       args.hasFlag(clang::options::OPT_funsigned,
+                                    clang::options::OPT_fno_unsigned, false));
 
   // -f{no-}xor-operator
-  opts.features.Enable(
-      Fortran::common::LanguageFeature::XOROperator,
-      args.hasFlag(clang::driver::options::OPT_fxor_operator,
-                   clang::driver::options::OPT_fno_xor_operator, false));
+  opts.features.Enable(Fortran::common::LanguageFeature::XOROperator,
+                       args.hasFlag(clang::options::OPT_fxor_operator,
+                                    clang::options::OPT_fno_xor_operator,
+                                    false));
 
   // -fno-automatic
-  if (args.hasArg(clang::driver::options::OPT_fno_automatic)) {
+  if (args.hasArg(clang::options::OPT_fno_automatic)) {
     opts.features.Enable(Fortran::common::LanguageFeature::DefaultSave);
   }
 
   // -f{no}-save-main-program
-  opts.features.Enable(
-      Fortran::common::LanguageFeature::SaveMainProgram,
-      args.hasFlag(clang::driver::options::OPT_fsave_main_program,
-                   clang::driver::options::OPT_fno_save_main_program, false));
+  opts.features.Enable(Fortran::common::LanguageFeature::SaveMainProgram,
+                       args.hasFlag(clang::options::OPT_fsave_main_program,
+                                    clang::options::OPT_fno_save_main_program,
+                                    false));
 
-  if (args.hasArg(
-          clang::driver::options::OPT_falternative_parameter_statement)) {
+  if (args.hasArg(clang::options::OPT_falternative_parameter_statement)) {
     opts.features.Enable(Fortran::common::LanguageFeature::OldStyleParameter);
   }
   if (const llvm::opt::Arg *arg =
-          args.getLastArg(clang::driver::options::OPT_finput_charset_EQ)) {
+          args.getLastArg(clang::options::OPT_finput_charset_EQ)) {
     llvm::StringRef argValue = arg->getValue();
     if (argValue == "utf-8") {
       opts.encoding = Fortran::parser::Encoding::UTF_8;
@@ -929,9 +910,9 @@ static std::string getOpenMPHeadersDir(const char *argv) {
 static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts,
                                   llvm::opt::ArgList &args) {
   // Add macros from the command line.
-  for (const auto *currentArg : args.filtered(clang::driver::options::OPT_D,
-                                              clang::driver::options::OPT_U)) {
-    if (currentArg->getOption().matches(clang::driver::options::OPT_D)) {
+  for (const auto *currentArg :
+       args.filtered(clang::options::OPT_D, clang::options::OPT_U)) {
+    if (currentArg->getOption().matches(clang::options::OPT_D)) {
       opts.addMacroDef(currentArg->getValue());
     } else {
       opts.addMacroUndef(currentArg->getValue());
@@ -939,34 +920,33 @@ static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts,
   }
 
   // Add the ordered list of -I's.
-  for (const auto *currentArg : args.filtered(clang::driver::options::OPT_I))
+  for (const auto *currentArg : args.filtered(clang::options::OPT_I))
     opts.searchDirectoriesFromDashI.emplace_back(currentArg->getValue());
 
   // Prepend the ordered list of -intrinsic-modules-path
   // to the default location to search.
   for (const auto *currentArg :
-       args.filtered(clang::driver::options::OPT_fintrinsic_modules_path))
+       args.filtered(clang::options::OPT_fintrinsic_modules_path))
     opts.searchDirectoriesFromIntrModPath.emplace_back(currentArg->getValue());
 
   // -cpp/-nocpp
-  if (const auto *currentArg = args.getLastArg(
-          clang::driver::options::OPT_cpp, clang::driver::options::OPT_nocpp))
-    opts.macrosFlag =
-        (currentArg->getOption().matches(clang::driver::options::OPT_cpp))
-            ? PPMacrosFlag::Include
-            : PPMacrosFlag::Exclude;
+  if (const auto *currentArg =
+          args.getLastArg(clang::options::OPT_cpp, clang::options::OPT_nocpp))
+    opts.macrosFlag = (currentArg->getOption().matches(clang::options::OPT_cpp))
+                          ? PPMacrosFlag::Include
+                          : PPMacrosFlag::Exclude;
   // Enable -cpp based on -x unless explicitly disabled with -nocpp
   if (opts.macrosFlag != PPMacrosFlag::Exclude)
-    if (const auto *dashX = args.getLastArg(clang::driver::options::OPT_x))
+    if (const auto *dashX = args.getLastArg(clang::options::OPT_x))
       opts.macrosFlag = llvm::StringSwitch<PPMacrosFlag>(dashX->getValue())
                             .Case("f95-cpp-input", PPMacrosFlag::Include)
                             .Default(opts.macrosFlag);
 
-  opts.noReformat = args.hasArg(clang::driver::options::OPT_fno_reformat);
+  opts.noReformat = args.hasArg(clang::options::OPT_fno_reformat);
   opts.preprocessIncludeLines =
-      args.hasArg(clang::driver::options::OPT_fpreprocess_include_lines);
-  opts.noLineDirectives = args.hasArg(clang::driver::options::OPT_P);
-  opts.showMacros = args.hasArg(clang::driver::options::OPT_dM);
+      args.hasArg(clang::options::OPT_fpreprocess_include_lines);
+  opts.noLineDirectives = args.hasArg(clang::options::OPT_P);
+  opts.showMacros = args.hasArg(clang::options::OPT_dM);
 }
 
 /// Parses all semantic related arguments and populates the variables
@@ -977,7 +957,7 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
 
   // -J/module-dir option
   std::vector<std::string> moduleDirList =
-      args.getAllArgValues(clang::driver::options::OPT_module_dir);
+      args.getAllArgValues(clang::options::OPT_module_dir);
   // User can only specify one -J/-module-dir directory, but may repeat
   // -J/-module-dir as long as the directory is the same each time.
   // https://gcc.gnu.org/onlinedocs/gfortran/Directory-Options.html
@@ -996,25 +976,25 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
     res.setModuleDir(moduleDirList[0]);
 
   // -fdebug-module-writer option
-  if (args.hasArg(clang::driver::options::OPT_fdebug_module_writer)) {
+  if (args.hasArg(clang::options::OPT_fdebug_module_writer)) {
     res.setDebugModuleDir(true);
   }
 
   // -fhermetic-module-files option
-  if (args.hasArg(clang::driver::options::OPT_fhermetic_module_files)) {
+  if (args.hasArg(clang::options::OPT_fhermetic_module_files)) {
     res.setHermeticModuleFileOutput(true);
   }
 
   // -module-suffix
   if (const auto *moduleSuffix =
-          args.getLastArg(clang::driver::options::OPT_module_suffix)) {
+          args.getLastArg(clang::options::OPT_module_suffix)) {
     res.setModuleFileSuffix(moduleSuffix->getValue());
   }
 
   // -f{no-}analyzed-objects-for-unparse
-  res.setUseAnalyzedObjectsForUnparse(args.hasFlag(
-      clang::driver::options::OPT_fanalyzed_objects_for_unparse,
-      clang::driver::options::OPT_fno_analyzed_objects_for_unparse, true));
+  res.setUseAnalyzedObjectsForUnparse(
+      args.hasFlag(clang::options::OPT_fanalyzed_objects_for_unparse,
+                   clang::options::OPT_fno_analyzed_objects_for_unparse, true));
 
   return diags.getNumErrors() == numErrorsBefore;
 }
@@ -1031,7 +1011,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   // chosen to match clang's behavior.
 
   // -pedantic
-  if (args.hasArg(clang::driver::options::OPT_pedantic)) {
+  if (args.hasArg(clang::options::OPT_pedantic)) {
     features.WarnOnAllNonstandard();
     features.WarnOnAllUsage();
     res.setEnableConformanceChecks();
@@ -1041,9 +1021,8 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   // -Werror option
   // TODO: Currently throws a Diagnostic for anything other than -W<error>,
   // this has to change when other -W<opt>'s are supported.
-  if (args.hasArg(clang::driver::options::OPT_W_Joined)) {
-    const auto &wArgs =
-        args.getAllArgValues(clang::driver::options::OPT_W_Joined);
+  if (args.hasArg(clang::options::OPT_W_Joined)) {
+    const auto &wArgs = args.getAllArgValues(clang::options::OPT_W_Joined);
     for (const auto &wArg : wArgs) {
       if (wArg == "error") {
         res.setWarnAsErr(true);
@@ -1060,7 +1039,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   }
 
   // -w
-  if (args.hasArg(clang::driver::options::OPT_w)) {
+  if (args.hasArg(clang::options::OPT_w)) {
     features.DisableAllWarnings();
     res.setDisableWarnings();
   }
@@ -1080,7 +1059,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   unsigned numErrorsBefore = diags.getNumErrors();
 
   // -fd-lines-as-code
-  if (args.hasArg(clang::driver::options::OPT_fd_lines_as_code)) {
+  if (args.hasArg(clang::options::OPT_fd_lines_as_code)) {
     if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) {
       const unsigned fdLinesAsWarning = diags.getCustomDiagID(
           clang::DiagnosticsEngine::Warning,
@@ -1093,7 +1072,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   }
 
   // -fd-lines-as-comments
-  if (args.hasArg(clang::driver::options::OPT_fd_lines_as_comments)) {
+  if (args.hasArg(clang::options::OPT_fd_lines_as_comments)) {
     if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) {
       const unsigned fdLinesAsWarning = diags.getCustomDiagID(
           clang::DiagnosticsEngine::Warning,
@@ -1106,18 +1085,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   }
 
   // -fdefault* family
-  if (args.hasArg(clang::driver::options::OPT_fdefault_real_8)) {
+  if (args.hasArg(clang::options::OPT_fdefault_real_8)) {
     res.getDefaultKinds().set_defaultRealKind(8);
     res.getDefaultKinds().set_doublePrecisionKind(16);
   }
-  if (args.hasArg(clang::driver::options::OPT_fdefault_integer_8)) {
+  if (args.hasArg(clang::options::OPT_fdefault_integer_8)) {
     res.getDefaultKinds().set_defaultIntegerKind(8);
     res.getDefaultKinds().set_subscriptIntegerKind(8);
     res.getDefaultKinds().set_sizeIntegerKind(8);
     res.getDefaultKinds().set_defaultLogicalKind(8);
   }
-  if (args.hasArg(clang::driver::options::OPT_fdefault_double_8)) {
-    if (!args.hasArg(clang::driver::options::OPT_fdefault_real_8)) {
+  if (args.hasArg(clang::options::OPT_fdefault_double_8)) {
+    if (!args.hasArg(clang::options::OPT_fdefault_real_8)) {
       // -fdefault-double-8 has to be used with -fdefault-real-8
       // to be compatible with gfortran
       const unsigned diagID = diags.getCustomDiagID(
@@ -1128,18 +1107,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
     // https://gcc.gnu.org/onlinedocs/gfortran/Fortran-Dialect-Options.html
     res.getDefaultKinds().set_doublePrecisionKind(8);
   }
-  if (args.hasArg(clang::driver::options::OPT_flarge_sizes))
+  if (args.hasArg(clang::options::OPT_flarge_sizes))
     res.getDefaultKinds().set_sizeIntegerKind(8);
 
   // -x cuda
-  auto language = args.getLastArgValue(clang::driver::options::OPT_x);
+  auto language = args.getLastArgValue(clang::options::OPT_x);
   if (language == "cuda") {
     res.getFrontendOpts().features.Enable(
         Fortran::common::LanguageFeature::CUDA);
   }
 
   // -fopenacc
-  if (args.hasArg(clang::driver::options::OPT_fopenacc)) {
+  if (args.hasArg(clang::options::OPT_fopenacc)) {
     res.getFrontendOpts().features.Enable(
         Fortran::common::LanguageFeature::OpenACC);
   }
@@ -1147,8 +1126,8 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   // -std=f2018
   // TODO: Set proper options when more fortran standards
   // are supported.
-  if (args.hasArg(clang::driver::options::OPT_std_EQ)) {
-    auto standard = args.getLastArgValue(clang::driver::options::OPT_std_EQ);
+  if (args.hasArg(clang::options::OPT_std_EQ)) {
+    auto standard = args.getLastArgValue(clang::options::OPT_std_EQ);
     // We only allow f2018 as the given standard
     if (standard == "f2018") {
       res.setEnableConformanceChecks();
@@ -1161,7 +1140,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
     }
   }
   // -fcoarray
-  if (args.hasArg(clang::driver::options::OPT_fcoarray)) {
+  if (args.hasArg(clang::options::OPT_fcoarray)) {
     res.getFrontendOpts().features.Enable(
         Fortran::common::LanguageFeature::Coarray);
     const unsigned diagID =
@@ -1179,13 +1158,12 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
 /// generated.
 static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
                             clang::DiagnosticsEngine &diags) {
-  llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp,
-                                        clang::driver::options::OPT_fno_openmp);
-  if (!arg ||
-      arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) {
-    bool isSimdSpecified = args.hasFlag(
-        clang::driver::options::OPT_fopenmp_simd,
-        clang::driver::options::OPT_fno_openmp_simd, /*Default=*/false);
+  llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fopenmp,
+                                        clang::options::OPT_fno_openmp);
+  if (!arg || arg->getOption().matches(clang::options::OPT_fno_openmp)) {
+    bool isSimdSpecified =
+        args.hasFlag(clang::options::OPT_fopenmp_simd,
+                     clang::options::OPT_fno_openmp_simd, /*Default=*/false);
     if (!isSimdSpecified)
       return true;
     res.getLangOpts().OpenMPSimd = 1;
@@ -1200,8 +1178,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   res.getLangOpts().OpenMPVersion = newestFullySupported;
   res.getFrontendOpts().features.Enable(
       Fortran::common::LanguageFeature::OpenMP);
-  if (auto *arg =
-          args.getLastArg(clang::driver::options::OPT_fopenmp_version_EQ)) {
+  if (auto *arg = args.getLastArg(clang::options::OPT_fopenmp_version_EQ)) {
     llvm::ArrayRef<unsigned> ompVersions = llvm::omp::getOpenMPVersions();
     unsigned oldVersions[] = {11, 20, 25, 30};
     unsigned version = 0;
@@ -1254,16 +1231,16 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
     }
   }
 
-  if (args.hasArg(clang::driver::options::OPT_fopenmp_force_usm)) {
+  if (args.hasArg(clang::options::OPT_fopenmp_force_usm)) {
     res.getLangOpts().OpenMPForceUSM = 1;
   }
-  if (args.hasArg(clang::driver::options::OPT_fopenmp_is_target_device)) {
+  if (args.hasArg(clang::options::OPT_fopenmp_is_target_device)) {
     res.getLangOpts().OpenMPIsTargetDevice = 1;
 
     // Get OpenMP host file path if any and report if a non existent file is
     // found
-    if (auto *arg = args.getLastArg(
-            clang::driver::options::OPT_fopenmp_host_ir_file_path)) {
+    if (auto *arg =
+            args.getLastArg(clang::options::OPT_fopenmp_host_ir_file_path)) {
       res.getLangOpts().OMPHostIRFile = arg->getValue();
       if (!llvm::sys::fs::exists(res.getLangOpts().OMPHostIRFile))
         diags.Report(clang::diag::err_omp_host_ir_file_not_found)
@@ -1271,37 +1248,34 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
     }
 
     if (args.hasFlag(
-            clang::driver::options::OPT_fopenmp_assume_teams_oversubscription,
-            clang::driver::options::
-                OPT_fno_openmp_assume_teams_oversubscription,
+            clang::options::OPT_fopenmp_assume_teams_oversubscription,
+            clang::options::OPT_fno_openmp_assume_teams_oversubscription,
             /*Default=*/false))
       res.getLangOpts().OpenMPTeamSubscription = true;
 
-    if (args.hasArg(clang::driver::options::OPT_fopenmp_assume_no_thread_state))
+    if (args.hasArg(clang::options::OPT_fopenmp_assume_no_thread_state))
       res.getLangOpts().OpenMPNoThreadState = 1;
 
-    if (args.hasArg(
-            clang::driver::options::OPT_fopenmp_assume_no_nested_parallelism))
+    if (args.hasArg(clang::options::OPT_fopenmp_assume_no_nested_parallelism))
       res.getLangOpts().OpenMPNoNestedParallelism = 1;
 
     if (args.hasFlag(
-            clang::driver::options::OPT_fopenmp_assume_threads_oversubscription,
-            clang::driver::options::
-                OPT_fno_openmp_assume_threads_oversubscription,
+            clang::options::OPT_fopenmp_assume_threads_oversubscription,
+            clang::options::OPT_fno_openmp_assume_threads_oversubscription,
             /*Default=*/false))
       res.getLangOpts().OpenMPThreadSubscription = true;
 
-    if ((args.hasArg(clang::driver::options::OPT_fopenmp_target_debug) ||
-         args.hasArg(clang::driver::options::OPT_fopenmp_target_debug_EQ))) {
-      res.getLangOpts().OpenMPTargetDebug = getLastArgIntValue(
-          args, clang::driver::options::OPT_fopenmp_target_debug_EQ,
-          res.getLangOpts().OpenMPTargetDebug, diags);
+    if ((args.hasArg(clang::options::OPT_fopenmp_target_debug) ||
+         args.hasArg(clang::options::OPT_fopenmp_target_debug_EQ))) {
+      res.getLangOpts().OpenMPTargetDebug =
+          getLastArgIntValue(args, clang::options::OPT_fopenmp_target_debug_EQ,
+                             res.getLangOpts().OpenMPTargetDebug, diags);
 
       if (!res.getLangOpts().OpenMPTargetDebug &&
-          args.hasArg(clang::driver::options::OPT_fopenmp_target_debug))
+          args.hasArg(clang::options::OPT_fopenmp_target_debug))
         res.getLangOpts().OpenMPTargetDebug = 1;
     }
-    if (args.hasArg(clang::driver::options::OPT_no_offloadlib))
+    if (args.hasArg(clang::options::OPT_no_offloadlib))
       res.getLangOpts().NoGPULib = 1;
   }
   if (llvm::Triple(res.getTargetOpts().triple).isGPU()) {
@@ -1317,8 +1291,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   }
 
   // Get the OpenMP target triples if any.
-  if (auto *arg =
-          args.getLastArg(clang::driver::options::OPT_offload_targets_EQ)) {
+  if (auto *arg = args.getLastArg(clang::options::OPT_offload_targets_EQ)) {
     enum ArchPtrSize { Arch16Bit, Arch32Bit, Arch64Bit };
     auto getArchPtrSize = [](const llvm::Triple &triple) {
       if (triple.isArch16Bit())
@@ -1361,7 +1334,7 @@ static bool parseIntegerOverflowArgs(CompilerInvocation &invoc,
                                      clang::DiagnosticsEngine &diags) {
   Fortran::common::LangOptions &opts = invoc.getLangOpts();
 
-  if (args.getLastArg(clang::driver::options::OPT_fwrapv))
+  if (args.getLastArg(clang::options::OPT_fwrapv))
     opts.setSignedOverflowBehavior(Fortran::common::LangOptions::SOB_Defined);
 
   return true;
@@ -1380,7 +1353,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
   Fortran::common::LangOptions &opts = invoc.getLangOpts();
 
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_ffp_contract)) {
+          args.getLastArg(clang::options::OPT_ffp_contract)) {
     const llvm::StringRef val = a->getValue();
     enum Fortran::common::LangOptions::FPModeKind fpContractMode;
 
@@ -1397,31 +1370,31 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
     opts.setFPContractMode(fpContractMode);
   }
 
-  if (args.getLastArg(clang::driver::options::OPT_menable_no_infs)) {
+  if (args.getLastArg(clang::options::OPT_menable_no_infs)) {
     opts.NoHonorInfs = true;
   }
 
-  if (args.getLastArg(clang::driver::options::OPT_menable_no_nans)) {
+  if (args.getLastArg(clang::options::OPT_menable_no_nans)) {
     opts.NoHonorNaNs = true;
   }
 
-  if (args.getLastArg(clang::driver::options::OPT_fapprox_func)) {
+  if (args.getLastArg(clang::options::OPT_fapprox_func)) {
     opts.ApproxFunc = true;
   }
 
-  if (args.getLastArg(clang::driver::options::OPT_fno_signed_zeros)) {
+  if (args.getLastArg(clang::options::OPT_fno_signed_zeros)) {
     opts.NoSignedZeros = true;
   }
 
-  if (args.getLastArg(clang::driver::options::OPT_mreassociate)) {
+  if (args.getLastArg(clang::options::OPT_mreassociate)) {
     opts.AssociativeMath = true;
   }
 
-  if (args.getLastArg(clang::driver::options::OPT_freciprocal_math)) {
+  if (args.getLastArg(clang::options::OPT_freciprocal_math)) {
     opts.ReciprocalMath = true;
   }
 
-  if (args.getLastArg(clang::driver::options::OPT_ffast_math)) {
+  if (args.getLastArg(clang::options::OPT_ffast_math)) {
     opts.NoHonorInfs = true;
     opts.NoHonorNaNs = true;
     opts.AssociativeMath = true;
@@ -1431,7 +1404,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
     opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast);
   }
 
-  if (args.hasArg(clang::driver::options::OPT_fno_fast_real_mod))
+  if (args.hasArg(clang::options::OPT_fno_fast_real_mod))
     opts.NoFastRealMod = true;
 
   return true;
@@ -1446,10 +1419,8 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
 /// \param [out] diags DiagnosticsEngine to report erros with
 static bool parseVScaleArgs(CompilerInvocation &invoc, llvm::opt::ArgList &args,
                             clang::DiagnosticsEngine &diags) {
-  const auto *vscaleMin =
-      args.getLastArg(clang::driver::options::OPT_mvscale_min_EQ);
-  const auto *vscaleMax =
-      args.getLastArg(clang::driver::options::OPT_mvscale_max_EQ);
+  const auto *vscaleMin = args.getLastArg(clang::options::OPT_mvscale_min_EQ);
+  const auto *vscaleMax = args.getLastArg(clang::options::OPT_mvscale_max_EQ);
 
   if (!vscaleMin && !vscaleMax)
     return true;
@@ -1497,8 +1468,7 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc,
 
   // TODO: support --dependent-lib on other platforms when MLIR supports
   //       !llvm.dependent.lib
-  if (args.hasArg(clang::driver::options::OPT_dependent_lib) &&
-      !triple.isOSWindows()) {
+  if (args.hasArg(clang::options::OPT_dependent_lib) && !triple.isOSWindows()) {
     const unsigned diagID =
         diags.getCustomDiagID(clang::DiagnosticsEngine::Error,
                               "--dependent-lib is only supported on Windows");
@@ -1506,12 +1476,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc,
     return false;
   }
 
-  opts.DependentLibs =
-      args.getAllArgValues(clang::driver::options::OPT_dependent_lib);
+  opts.DependentLibs = args.getAllArgValues(clang::options::OPT_dependent_lib);
 
   // -flto=full/thin option.
-  if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::driver::options::OPT_flto_EQ)) {
+  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_flto_EQ)) {
     llvm::StringRef s = a->getValue();
     assert((s == "full" || s == "thin") && "Unknown LTO mode.");
     if (s == "full")
@@ -1522,10 +1490,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc,
 
   // -ffat-lto-objects
   if (const llvm::opt::Arg *arg =
-          args.getLastArg(clang::driver::options::OPT_ffat_lto_objects,
-                          clang::driver::options::OPT_fno_fat_lto_objects)) {
+          args.getLastArg(clang::options::OPT_ffat_lto_objects,
+                          clang::options::OPT_fno_fat_lto_objects)) {
     opts.PrepareForFatLTO =
-        arg->getOption().matches(clang::driver::options::OPT_ffat_lto_objects);
+        arg->getOption().matches(clang::options::OPT_ffat_lto_objects);
     if (opts.PrepareForFatLTO) {
       assert((opts.PrepareForFullLTO || opts.PrepareForThinLTO) &&
              "Unknown LTO mode");
@@ -1566,8 +1534,8 @@ bool CompilerInvocation::createFromArgs(
       llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple());
 
   // Parse the arguments
-  const llvm::opt::OptTable &opts = clang::driver::getDriverOptTable();
-  llvm::opt::Visibility visibilityMask(clang::driver::options::FC1Option);
+  const llvm::opt::OptTable &opts = clang::getDriverOptTable();
+  llvm::opt::Visibility visibilityMask(clang::options::FC1Option);
   unsigned missingArgIndex, missingArgCount;
   llvm::opt::InputArgList args = opts.ParseArgs(
       commandLineArgs, missingArgIndex, missingArgCount, visibilityMask);
@@ -1580,7 +1548,7 @@ bool CompilerInvocation::createFromArgs(
   }
 
   // Issue errors on unknown arguments
-  for (const auto *a : args.filtered(clang::driver::options::OPT_UNKNOWN)) {
+  for (const auto *a : args.filtered(clang::options::OPT_UNKNOWN)) {
     auto argString = a->getAsString(args);
     std::string nearest;
     if (opts.findNearest(argString, nearest, visibilityMask) > 1)
@@ -1592,15 +1560,15 @@ bool CompilerInvocation::createFromArgs(
   }
 
   // -flang-experimental-hlfir
-  if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir) ||
-      args.hasArg(clang::driver::options::OPT_emit_hlfir)) {
+  if (args.hasArg(clang::options::OPT_flang_experimental_hlfir) ||
+      args.hasArg(clang::options::OPT_emit_hlfir)) {
     invoc.loweringOpts.setLowerToHighLevelFIR(true);
   }
 
   // -flang-deprecated-no-hlfir
-  if (args.hasArg(clang::driver::options::OPT_flang_deprecated_no_hlfir) &&
-      !args.hasArg(clang::driver::options::OPT_emit_hlfir)) {
-    if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir)) {
+  if (args.hasArg(clang::options::OPT_flang_deprecated_no_hlfir) &&
+      !args.hasArg(clang::options::OPT_emit_hlfir)) {
+    if (args.hasArg(clang::options::OPT_flang_experimental_hlfir)) {
       const unsigned diagID = diags.getCustomDiagID(
           clang::DiagnosticsEngine::Error,
           "Options '-flang-experimental-hlfir' and "
@@ -1611,13 +1579,13 @@ bool CompilerInvocation::createFromArgs(
   }
 
   // -fno-ppc-native-vector-element-order
-  if (args.hasArg(clang::driver::options::OPT_fno_ppc_native_vec_elem_order)) {
+  if (args.hasArg(clang::options::OPT_fno_ppc_native_vec_elem_order)) {
     invoc.loweringOpts.setNoPPCNativeVecElemOrder(true);
   }
 
   // -f[no-]init-global-zero
-  if (args.hasFlag(clang::driver::options::OPT_finit_global_zero,
-                   clang::driver::options::OPT_fno_init_global_zero,
+  if (args.hasFlag(clang::options::OPT_finit_global_zero,
+                   clang::options::OPT_fno_init_global_zero,
                    /*default=*/true))
     invoc.loweringOpts.setInitGlobalZero(true);
   else
@@ -1626,8 +1594,8 @@ bool CompilerInvocation::createFromArgs(
   // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or
   // -Rpass-analysis. This will be used later when processing and outputting the
   // remarks generated by LLVM in ExecuteCompilerInvocation.cpp.
-  for (auto *a : args.filtered(clang::driver::options::OPT_R_Group)) {
-    if (a->getOption().matches(clang::driver::options::OPT_R_value_Group))
+  for (auto *a : args.filtered(clang::options::OPT_R_Group)) {
+    if (a->getOption().matches(clang::options::OPT_R_value_Group))
       // This is -Rfoo=, where foo is the name of the diagnostic
       // group. Add only the remark option name to the diagnostics. e.g. for
       // -Rpass= we will add the string "pass".
@@ -1640,20 +1608,19 @@ bool CompilerInvocation::createFromArgs(
   }
 
   // -frealloc-lhs is the default.
-  if (!args.hasFlag(clang::driver::options::OPT_frealloc_lhs,
-                    clang::driver::options::OPT_fno_realloc_lhs, true))
+  if (!args.hasFlag(clang::options::OPT_frealloc_lhs,
+                    clang::options::OPT_fno_realloc_lhs, true))
     invoc.loweringOpts.setReallocateLHS(false);
 
-  invoc.loweringOpts.setRepackArrays(
-      args.hasFlag(clang::driver::options::OPT_frepack_arrays,
-                   clang::driver::options::OPT_fno_repack_arrays,
-                   /*default=*/false));
+  invoc.loweringOpts.setRepackArrays(args.hasFlag(
+      clang::options::OPT_frepack_arrays, clang::options::OPT_fno_repack_arrays,
+      /*default=*/false));
   invoc.loweringOpts.setStackRepackArrays(
-      args.hasFlag(clang::driver::options::OPT_fstack_repack_arrays,
-                   clang::driver::options::OPT_fno_stack_repack_arrays,
+      args.hasFlag(clang::options::OPT_fstack_repack_arrays,
+                   clang::options::OPT_fno_stack_repack_arrays,
                    /*default=*/false));
-  if (auto *arg = args.getLastArg(
-          clang::driver::options::OPT_frepack_arrays_contiguity_EQ))
+  if (auto *arg =
+          args.getLastArg(clang::options::OPT_frepack_arrays_contiguity_EQ))
     invoc.loweringOpts.setRepackArraysWhole(arg->getValue() ==
                                             llvm::StringRef{"whole"});
 
@@ -1673,10 +1640,8 @@ bool CompilerInvocation::createFromArgs(
   // `mlirArgs`. Instead, you can use
   //    * `-mllvm <your-llvm-option>`, or
   //    * `-mmlir <your-mlir-option>`.
-  invoc.frontendOpts.llvmArgs =
-      args.getAllArgValues(clang::driver::options::OPT_mllvm);
-  invoc.frontendOpts.mlirArgs =
-      args.getAllArgValues(clang::driver::options::OPT_mmlir);
+  invoc.frontendOpts.llvmArgs = args.getAllArgValues(clang::options::OPT_mllvm);
+  invoc.frontendOpts.mlirArgs = args.getAllArgValues(clang::options::OPT_mmlir);
 
   success &= parseLangOptionsArgs(invoc, args, diags);
 
@@ -1700,7 +1665,7 @@ bool CompilerInvocation::createFromArgs(
   }
 
   // Process the timing-related options.
-  if (args.hasArg(clang::driver::options::OPT_ftime_report))
+  if (args.hasArg(clang::options::OPT_ftime_report))
     invoc.enableTimers = true;
 
   invoc.setArgv0(argv0);
diff --git a/flang/lib/FrontendTool/CMakeLists.txt b/flang/lib/FrontendTool/CMakeLists.txt
index faf56e9d955a1..b69436c36d438 100644
--- a/flang/lib/FrontendTool/CMakeLists.txt
+++ b/flang/lib/FrontendTool/CMakeLists.txt
@@ -18,5 +18,6 @@ add_flang_library(flangFrontendTool
 
   CLANG_LIBS
   clangBasic
+  clangOptions
   clangDriver
 )
diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
index 09ac129d3e689..7586be59ba01b 100644
--- a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
+++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
@@ -23,7 +23,7 @@
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Pass/PassManager.h"
 #include "clang/Basic/DiagnosticFrontend.h"
-#include "clang/Driver/Options.h"
+#include "clang/Options/Options.h"
 #include "llvm/Option/OptTable.h"
 #include "llvm/Option/Option.h"
 #include "llvm/Support/BuryPointer.h"
@@ -153,10 +153,10 @@ updateDiagEngineForOptRemarks(clang::DiagnosticsEngine &diagsEng,
 bool executeCompilerInvocation(CompilerInstance *flang) {
   // Honor -help.
   if (flang->getFrontendOpts().showHelp) {
-    clang::driver::getDriverOptTable().printHelp(
+    clang::getDriverOptTable().printHelp(
         llvm::outs(), "flang -fc1 [options] file...", "LLVM 'Flang' Compiler",
         /*ShowHidden=*/false, /*ShowAllAliases=*/false,
-        llvm::opt::Visibility(clang::driver::options::FC1Option));
+        llvm::opt::Visibility(clang::options::FC1Option));
     return true;
   }
 
diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt
index b5d6727025121..4dfc0d40cd55d 100644
--- a/flang/tools/flang-driver/CMakeLists.txt
+++ b/flang/tools/flang-driver/CMakeLists.txt
@@ -26,6 +26,7 @@ target_link_libraries(flang
 clang_target_link_libraries(flang
   PRIVATE
   clangDriver
+  clangOptions
   clangBasic
 )
 
diff --git a/flang/tools/flang-driver/driver.cpp b/flang/tools/flang-driver/driver.cpp
index bd878b7a642f1..0840255a739f3 100644
--- a/flang/tools/flang-driver/driver.cpp
+++ b/flang/tools/flang-driver/driver.cpp
@@ -52,9 +52,9 @@ createAndPopulateDiagOpts(llvm::ArrayRef<const char *> argv) {
   // Any errors that would be diagnosed here will also be diagnosed later,
   // when the DiagnosticsEngine actually exists.
   unsigned missingArgIndex, missingArgCount;
-  llvm::opt::InputArgList args = clang::driver::getDriverOptTable().ParseArgs(
+  llvm::opt::InputArgList args = clang::getDriverOptTable().ParseArgs(
       argv.slice(1), missingArgIndex, missingArgCount,
-      llvm::opt::Visibility(clang::driver::options::FlangOption));
+      llvm::opt::Visibility(clang::options::FlangOption));
 
   (void)Fortran::frontend::parseDiagnosticArgs(*diagOpts, args);
 
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
index 8b4a3e0a7c3fb..bfbd85ea34203 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
@@ -56,7 +56,7 @@ using namespace lldb;
 using namespace lldb_private;
 
 #define OPTTABLE_STR_TABLE_CODE
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef OPTTABLE_STR_TABLE_CODE
 
 static Status ExceptionMaskValidator(const char *string, void *unused) {
@@ -1124,7 +1124,7 @@ void PlatformDarwin::AddClangModuleCompilationOptionsForSDKType(
 #define OPTION(PREFIX_OFFSET, NAME_OFFSET, VAR, ...)                           \
   llvm::StringRef opt_##VAR = OptionStrTable[NAME_OFFSET];                     \
   (void)opt_##VAR;
-#include "clang/Driver/Options.inc"
+#include "clang/Options/Options.inc"
 #undef OPTION
     minimum_version_option << '-';
     switch (sdk_type) {
diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index 7a2e9ad154f01..ea8ac6dbe6e34 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -401,7 +401,7 @@ class LLVM_ABI MCAsmInfo {
   // Generated object files can use all ELF features supported by GNU ld of
   // this binutils version and later. INT_MAX means all features can be used,
   // regardless of GNU ld support. The default value is referenced by
-  // clang/Driver/Options.td.
+  // clang/Options/Options.td.
   std::pair<int, int> BinutilsVersion = {2, 26};
 
   /// Should we use the integrated assembler?
diff --git a/llvm/include/llvm/Option/Arg.h b/llvm/include/llvm/Option/Arg.h
index b1e56b58da684..496373d28600f 100644
--- a/llvm/include/llvm/Option/Arg.h
+++ b/llvm/include/llvm/Option/Arg.h
@@ -51,7 +51,7 @@ class Arg {
   /// Was this argument used to affect compilation?
   ///
   /// This is used to generate an "argument unused" warning (without
-  /// clang::driver::options::TargetSpecific) or "unsupported option" error
+  /// clang::options::TargetSpecific) or "unsupported option" error
   /// (with TargetSpecific).
   mutable unsigned Claimed : 1;
 
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 4d279bffd3723..36b71dd49ef13 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1501,9 +1501,9 @@ cc_library(
 
 gentbl_cc_library(
     name = "driver_options_inc_gen",
-    tbl_outs = {"include/clang/Driver/Options.inc": ["-gen-opt-parser-defs"]},
+    tbl_outs = {"include/clang/Options/Options.inc": ["-gen-opt-parser-defs"]},
     tblgen = "//llvm:llvm-tblgen",
-    td_file = "include/clang/Driver/Options.td",
+    td_file = "include/clang/Options/Options.td",
     deps = ["//llvm:OptParserTdFiles"],
 )
 

From 4ae7348513c700cf0a34a65094e925cba1084424 Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Mon, 10 Nov 2025 12:32:43 -0800
Subject: [PATCH 02/97] [DirectX] Teach DXILResourceAccess about cbuffers
 (#164554)

This isn't reachable today but will come into play once we reorder
passes for #147352 and #147351.

Note that the `CBufferRowIntrin` helper struct is copied from the
`DXILCBufferAccess` pass, but it will be removed from there when we
simplify that pass in #147351
---
 .../lib/Target/DirectX/DXILResourceAccess.cpp | 156 ++++++++++++++++--
 .../load-cbuffer-array-of-struct.ll           |  59 +++++++
 .../load-cbuffer-array-of-vector.ll           |  49 ++++++
 .../load-cbuffer-array-typedgep.ll            |  30 ++++
 .../ResourceAccess/load-cbuffer-arrays.ll     | 145 ++++++++++++++++
 .../load-cbuffer-dynamic-struct.ll            |  64 +++++++
 .../ResourceAccess/load-cbuffer-dynamic.ll    |  46 ++++++
 .../ResourceAccess/load-cbuffer-scalars.ll    | 101 ++++++++++++
 .../ResourceAccess/load-cbuffer-vectors.ll    | 121 ++++++++++++++
 9 files changed, 761 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll
 create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll

diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
index 6579d3405cf39..057d87bc3c6a9 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
@@ -10,6 +10,7 @@
 #include "DirectX.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/DXILResource.h"
+#include "llvm/Frontend/HLSL/HLSLResource.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
@@ -20,6 +21,7 @@
 #include "llvm/IR/IntrinsicsDirectX.h"
 #include "llvm/IR/User.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
 #define DEBUG_TYPE "dxil-resource-access"
@@ -44,16 +46,28 @@ static Value *calculateGEPOffset(GetElementPtrInst *GEP, Value *PrevOffset,
   APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
   if (GEP->accumulateConstantOffset(DL, ConstantOffset)) {
     APInt Scaled = ConstantOffset.udiv(ScalarSize);
-    return ConstantInt::get(Type::getInt32Ty(GEP->getContext()), Scaled);
+    return ConstantInt::get(DL.getIndexType(GEP->getType()), Scaled);
   }
 
-  auto IndexIt = GEP->idx_begin();
-  assert(cast<ConstantInt>(IndexIt)->getZExtValue() == 0 &&
-         "GEP is not indexing through pointer");
-  ++IndexIt;
-  Value *Offset = *IndexIt;
-  assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP");
-  return Offset;
+  unsigned NumIndices = GEP->getNumIndices();
+
+  // If we have a single index we're indexing into a top level array. This
+  // generally only happens with cbuffers.
+  if (NumIndices == 1)
+    return *GEP->idx_begin();
+
+  // If we have two indices, this should be a simple access through a pointer.
+  if (NumIndices == 2) {
+    auto IndexIt = GEP->idx_begin();
+    assert(cast<ConstantInt>(IndexIt)->getZExtValue() == 0 &&
+           "GEP is not indexing through pointer");
+    ++IndexIt;
+    Value *Offset = *IndexIt;
+    assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP");
+    return Offset;
+  }
+
+  llvm_unreachable("Unhandled GEP structure for resource access");
 }
 
 static void createTypedBufferStore(IntrinsicInst *II, StoreInst *SI,
@@ -171,6 +185,127 @@ static void createRawLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset) {
   LI->replaceAllUsesWith(V);
 }
 
+namespace {
+/// Helper for building a `load.cbufferrow` intrinsic given a simple type.
+struct CBufferRowIntrin {
+  Intrinsic::ID IID;
+  Type *RetTy;
+  unsigned int EltSize;
+  unsigned int NumElts;
+
+  CBufferRowIntrin(const DataLayout &DL, Type *Ty) {
+    assert(Ty == Ty->getScalarType() && "Expected scalar type");
+
+    switch (DL.getTypeSizeInBits(Ty)) {
+    case 16:
+      IID = Intrinsic::dx_resource_load_cbufferrow_8;
+      RetTy = StructType::get(Ty, Ty, Ty, Ty, Ty, Ty, Ty, Ty);
+      EltSize = 2;
+      NumElts = 8;
+      break;
+    case 32:
+      IID = Intrinsic::dx_resource_load_cbufferrow_4;
+      RetTy = StructType::get(Ty, Ty, Ty, Ty);
+      EltSize = 4;
+      NumElts = 4;
+      break;
+    case 64:
+      IID = Intrinsic::dx_resource_load_cbufferrow_2;
+      RetTy = StructType::get(Ty, Ty);
+      EltSize = 8;
+      NumElts = 2;
+      break;
+    default:
+      llvm_unreachable("Only 16, 32, and 64 bit types supported");
+    }
+  }
+};
+} // namespace
+
+static void createCBufferLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset,
+                              dxil::ResourceTypeInfo &RTI) {
+  const DataLayout &DL = LI->getDataLayout();
+
+  Type *Ty = LI->getType();
+  assert(!isa<StructType>(Ty) && "Structs not handled yet");
+  CBufferRowIntrin Intrin(DL, Ty->getScalarType());
+
+  StringRef Name = LI->getName();
+  Value *Handle = II->getOperand(0);
+
+  IRBuilder<> Builder(LI);
+
+  ConstantInt *GlobalOffset = dyn_cast<ConstantInt>(II->getOperand(1));
+  assert(GlobalOffset && "CBuffer getpointer index must be constant");
+
+  unsigned int FixedOffset = GlobalOffset->getZExtValue();
+  // If we have a further constant offset we can just fold it in to the fixed
+  // offset.
+  if (auto *ConstOffset = dyn_cast_if_present<ConstantInt>(Offset)) {
+    FixedOffset += ConstOffset->getZExtValue();
+    Offset = nullptr;
+  }
+
+  Value *CurrentRow = ConstantInt::get(
+      Builder.getInt32Ty(), FixedOffset / hlsl::CBufferRowSizeInBytes);
+  unsigned int CurrentIndex =
+      (FixedOffset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize;
+
+  assert(!(CurrentIndex && Offset) &&
+         "Dynamic indexing into elements of cbuffer rows is not supported");
+  // At this point if we have a non-constant offset it has to be an array
+  // offset, so we can assume that it's a multiple of the row size.
+  if (Offset)
+    CurrentRow = FixedOffset ? Builder.CreateAdd(CurrentRow, Offset) : Offset;
+
+  auto *CBufLoad = Builder.CreateIntrinsic(
+      Intrin.RetTy, Intrin.IID, {Handle, CurrentRow}, nullptr, Name + ".load");
+  auto *Elt =
+      Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, Name + ".extract");
+
+  // At this point we've loaded the first scalar of our result, but our original
+  // type may have been a vector.
+  unsigned int Remaining =
+      ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1;
+  if (Remaining == 0) {
+    // We only have a single element, so we're done.
+    Value *Result = Elt;
+
+    // However, if we loaded a <1 x T>, then we need to adjust the type.
+    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
+      assert(VT->getNumElements() == 1 && "Can't have multiple elements here");
+      Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result,
+                                           Builder.getInt32(0), Name);
+    }
+    LI->replaceAllUsesWith(Result);
+    return;
+  }
+
+  // Walk each element and extract it, wrapping to new rows as needed.
+  SmallVector<Value *> Extracts{Elt};
+  while (Remaining--) {
+    CurrentIndex %= Intrin.NumElts;
+
+    if (CurrentIndex == 0) {
+      CurrentRow = Builder.CreateAdd(CurrentRow,
+                                     ConstantInt::get(Builder.getInt32Ty(), 1));
+      CBufLoad = Builder.CreateIntrinsic(Intrin.RetTy, Intrin.IID,
+                                         {Handle, CurrentRow}, nullptr,
+                                         Name + ".load");
+    }
+
+    Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++},
+                                                  Name + ".extract"));
+  }
+
+  // Finally, we build up the original loaded value.
+  Value *Result = PoisonValue::get(Ty);
+  for (int I = 0, E = Extracts.size(); I < E; ++I)
+    Result = Builder.CreateInsertElement(
+        Result, Extracts[I], Builder.getInt32(I), Name + formatv(".upto{}", I));
+  LI->replaceAllUsesWith(Result);
+}
+
 static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset,
                                 dxil::ResourceTypeInfo &RTI) {
   switch (RTI.getResourceKind()) {
@@ -179,6 +314,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset,
   case dxil::ResourceKind::RawBuffer:
   case dxil::ResourceKind::StructuredBuffer:
     return createRawLoad(II, LI, Offset);
+  case dxil::ResourceKind::CBuffer:
+    return createCBufferLoad(II, LI, Offset, RTI);
   case dxil::ResourceKind::Texture1D:
   case dxil::ResourceKind::Texture2D:
   case dxil::ResourceKind::Texture2DMS:
@@ -190,9 +327,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset,
   case dxil::ResourceKind::TextureCubeArray:
   case dxil::ResourceKind::FeedbackTexture2D:
   case dxil::ResourceKind::FeedbackTexture2DArray:
-  case dxil::ResourceKind::CBuffer:
   case dxil::ResourceKind::TBuffer:
-    // TODO: handle these
+    reportFatalUsageError("Load not yet implemented for resource type");
     return;
   case dxil::ResourceKind::Sampler:
   case dxil::ResourceKind::RTAccelerationStructure:
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll
new file mode 100644
index 0000000000000..22fba8c1d5f8c
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll
@@ -0,0 +1,59 @@
+; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s
+;
+; Tests for indexed types in dynamically indexed arrays in cbuffers.
+;
+; struct S {
+;   float x[2];
+;   uint q;
+; };
+; cbuffer CB : register(b0) {
+;   uint32_t3 w[3]; // offset  0, size 12 (+4) * 3
+;   S v[3];         // offset 48, size 24 (+8) * 3
+; }
+%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }>
+%__cblayout_CB = type <{
+  <{
+    [2 x <{ <3 x i32>, target("dx.Padding", 4) }>],
+    <3 x i32>
+  }>,
+  target("dx.Padding", 4),
+  <{
+    [2 x <{ %S, target("dx.Padding", 8) }>], %S
+  }>
+}>
+
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+
+; CHECK: define void @f
+define void @f(ptr %dst, i32 %idx) {
+entry:
+  %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null)
+  store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4
+
+  ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb
+  %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4
+
+  ;; w[2].z
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2
+  ; CHECK: store i32 [[X]], ptr %dst
+  %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0)
+  %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_ptr, i32 40
+  %w_load = load i32, ptr addrspace(2) %w_gep, align 4
+  store i32 %w_load, ptr %dst, align 4
+
+  ;; v[2].q
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4
+  ; CHECK: store i32 [[X]], ptr [[PTR]]
+  %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48)
+  %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_ptr, i32 84
+  %v_load = load i32, ptr addrspace(2) %v_gep, align 4
+  %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4
+  store i32 %v_load, ptr %v.i, align 4
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll
new file mode 100644
index 0000000000000..615fc5ea07eca
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s
+;
+; Test for when we have indices into both the array and the vector: ie, s[1][3]
+
+; cbuffer CB : register(b0) {
+;   uint4 s[3]; // offset   0,  size 16        * 3
+; }
+%__cblayout_CB = type <{ [2 x <4 x i32>] }>
+
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+
+; CHECK: define void @f
+define void @f(ptr %dst) {
+entry:
+  %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null)
+  store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4
+
+  ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb
+  %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4
+
+  ;; s[1][3]
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3
+  ; CHECK: store i32 [[X]], ptr %dst
+  %i8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0)
+  %i8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %i8_ptr, i32 28
+  %i8_vecext = load i32, ptr addrspace(2) %i8_gep, align 4
+  store i32 %i8_vecext, ptr %dst, align 4
+
+  ;; s[2].w
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3
+  ;;
+  ;; It would be nice to avoid the redundant vector creation here, but that's
+  ;; outside of the scope of this pass.
+  ;;
+  ; CHECK: [[X_VEC:%.*]] = insertelement <4 x i32> {{%.*}}, i32 [[X]], i32 3
+  ; CHECK: [[X_EXT:%.*]] = extractelement <4 x i32> [[X_VEC]], i32 3
+  ; CHECK: store i32 [[X_EXT]], ptr %dst
+  %typed_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0)
+  %typed_gep = getelementptr <4 x i32>, ptr addrspace(2) %typed_ptr, i32 2
+  %typed_load = load <4 x i32>, ptr addrspace(2) %typed_gep, align 16
+  %typed_vecext = extractelement <4 x i32> %typed_load, i32 3
+  store i32 %typed_vecext, ptr %dst, align 4
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll
new file mode 100644
index 0000000000000..eabc07c2fbb68
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll
@@ -0,0 +1,30 @@
+; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s
+
+; cbuffer CB : register(b0) {
+;   float a1[3];
+; }
+%__cblayout_CB = type <{ [2 x <{ float, [12 x i8] }>], float }>
+
+@CB.cb = global target("dx.CBuffer", %__cblayout_CB) poison
+
+; CHECK: define void @f
+define void @f(ptr %dst) {
+entry:
+  %CB.cb_h = call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+  store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4
+
+  ;; a1[1]
+  ;; Note that the valid GEPs of a1 are `0, 0, 0`, `0, 0, 1`, and `0, 1`.
+  ;
+  ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb
+  ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1)
+  ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0
+  ; CHECK: store float [[X]], ptr %dst
+  %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8
+  %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0)
+  %a1_gep = getelementptr inbounds <{ [2 x <{ float, [12 x i8] }>], float }>, ptr addrspace(2) %a1_ptr, i32 0, i32 0, i32 1
+  %a1 = load float, ptr addrspace(2) %a1_gep, align 4
+  store float %a1, ptr %dst, align 32
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll
new file mode 100644
index 0000000000000..6f6166e820a6f
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll
@@ -0,0 +1,145 @@
+; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s
+
+; cbuffer CB : register(b0) {
+;   float a1[3];        // offset   0,  size 4  (+12) * 3
+;   double3 a2[2];      // offset   48, size 24  (+8) * 2
+;   float16_t a3[2][2]; // offset  112, size  2 (+14) * 4
+;   uint64_t a4[3];     // offset  176, size  8  (+8) * 3
+;   int4 a5[2][3][4];   // offset  224, size 16       * 24
+;   uint16_t a6[1];     // offset  608, size  2 (+14) * 1
+;   int64_t a7[2];      // offset  624, size  8  (+8) * 2
+;   bool a8[4];         // offset  656, size  4 (+12) * 4
+; }
+%__cblayout_CB = type <{
+  <{ [2 x <{ float, target("dx.Padding", 12) }>], float }>, target("dx.Padding", 12),
+  <{ [1 x <{ <3 x double>, target("dx.Padding", 8) }>], <3 x double> }>, target("dx.Padding", 8),
+  <{ [3 x <{ half, target("dx.Padding", 14) }>], half }>, target("dx.Padding", 14),
+  <{ [2 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8),
+  [24 x <4 x i32>],
+  [1 x i16], target("dx.Padding", 14),
+  <{ [1 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8),
+  <{ [3 x <{ i32, target("dx.Padding", 12) }>], i32 }>
+}>
+
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+
+; CHECK: define void @f
+define void @f(ptr %dst) {
+entry:
+  %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+  store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4
+
+  ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb
+  %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4
+
+  ;; a1[1]
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1)
+  ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0
+  ; CHECK: store float [[X]], ptr %dst
+  %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0)
+  %a1_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a1_ptr, i32 16
+  %a1 = load float, ptr addrspace(2) %a1_gep, align 4
+  store float %a1, ptr %dst, align 32
+
+  ;; a2[1]
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5)
+  ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0
+  ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1
+  ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6)
+  ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0
+  ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0
+  ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1
+  ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8
+  ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]]
+  %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48)
+  %a2_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a2_ptr, i32 32
+  %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 8
+  %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 8
+  store <3 x double> %a2, ptr %a2.i, align 32
+
+  ;; a3[0][1]
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8)
+  ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 32
+  ; CHECK: store half [[X]], ptr [[PTR]]
+  %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 112)
+  %a3_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a3_ptr, i32 16
+  %a3 = load half, ptr addrspace(2) %a3_gep, align 2
+  %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 32
+  store half %a3, ptr %a3.i, align 2
+
+  ;; a4[1]
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 12)
+  ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40
+  ; CHECK: store i64 [[X]], ptr [[PTR]]
+  %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 176)
+  %a4_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a4_ptr, i32 16
+  %a4 = load i64, ptr addrspace(2) %a4_gep, align 8
+  %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 40
+  store i64 %a4, ptr %a4.i, align 8
+
+  ;; a5[1][0][0]
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 26)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+  ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1
+  ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2
+  ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3
+  ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+  ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1
+  ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2
+  ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48
+  ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]]
+  %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 224)
+  %a5_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a5_ptr, i32 192
+  %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 4
+  %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 48
+  store <4 x i32> %a5, ptr %a5.i, align 4
+
+  ;; a6[0]
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 38)
+  ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 64
+  ; CHECK: store i16 [[X]], ptr [[PTR]]
+  %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 608)
+  %a6 = load i16, ptr addrspace(2) %a6_ptr, align 2
+  %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 64
+  store i16 %a6, ptr %a6.i, align 2
+
+  ;; a7[1]
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 40)
+  ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72
+  ; CHECK: store i64 [[X]], ptr [[PTR]]
+  %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 624)
+  %a7_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a7_ptr, i32 16
+  %a7 = load i64, ptr addrspace(2) %a7_gep, align 8
+  %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 72
+  store i64 %a7, ptr %a7.i, align 8
+
+  ;; a8[1]
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 42)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 80
+  ; CHECK: store i32 [[X]], ptr [[PTR]]
+  %a8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 656)
+  %a8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a8_ptr, i32 16
+  %a8 = load i32, ptr addrspace(2) %a8_gep, align 4, !range !0, !noundef !1
+  %a8.i = getelementptr inbounds nuw i8, ptr %dst, i32 80
+  store i32 %a8, ptr %a8.i, align 4
+
+  ret void
+}
+
+!0 = !{i32 0, i32 2}
+!1 = !{}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll
new file mode 100644
index 0000000000000..22994cfc3f48a
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s
+;
+; Tests for indexed types in dynamically indexed arrays in cbuffers.
+;
+; Bug https://github.com/llvm/llvm-project/issues/164517
+; XFAIL: *
+;
+; struct S {
+;   float x[2];
+;   uint q;
+; };
+; cbuffer CB : register(b0) {
+;   uint32_t3 w[3]; // offset  0, size 12 (+4) * 3
+;   S v[3];         // offset 48, size 24 (+8) * 3
+; }
+%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }>
+%__cblayout_CB = type <{
+  <{
+    [2 x <{ <3 x i32>, target("dx.Padding", 4) }>],
+    <3 x i32>
+  }>,
+  target("dx.Padding", 4),
+  <{
+    [2 x <{ %S, target("dx.Padding", 8) }>], %S
+  }>
+}>
+
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+
+; CHECK: define void @f
+define void @f(ptr %dst, i32 %idx) {
+entry:
+  %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null)
+  store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4
+
+  ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb
+  %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4
+
+  ;; w[idx].z
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2
+  ; CHECK: store i32 [[X]], ptr %dst
+  %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0)
+  %w_arrayidx = getelementptr <3 x i32>, ptr addrspace(2) %w_ptr, i32 %idx
+  %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_arrayidx, i32 4
+  %w_load = load i32, ptr addrspace(2) %w_gep, align 4
+  store i32 %w_load, ptr %dst, align 4
+
+  ;; v[idx].q
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4
+  ; CHECK: store i32 [[X]], ptr [[PTR]]
+  %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48)
+  %v_arrayidx = getelementptr <{ %struct.S, target("dx.Padding", 4) }>, ptr addrspace(2) %v_ptr, i32 %idx
+  %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_arrayidx, i32 8
+  %v_load = load i32, ptr addrspace(2) %v_gep, align 4
+  %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4
+  store i32 %v_load, ptr %v.i, align 4
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll
new file mode 100644
index 0000000000000..7daebaed70442
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s
+;
+; Tests for dynamic indices into arrays in cbuffers.
+
+; cbuffer CB : register(b0) {
+;   uint s[10]; // offset   0,  size  4 (+12) * 10
+;   uint t[12]; // offset 160,  size  4 (+12) * 12
+; }
+%__cblayout_CB = type <{ <{ [9 x <{ i32, target("dx.Padding", 12) }>], i32 }>, target("dx.Padding", 12), <{ [11 x <{ i32, target("dx.Padding", 12) }>], i32 }> }>
+
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+
+; CHECK: define void @f
+define void @f(ptr %dst, i32 %idx) {
+entry:
+  %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null)
+  store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4
+
+  ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb
+  %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4
+
+  ;; s[idx]
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+  ; CHECK: store i32 [[X]], ptr %dst
+  %s_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0)
+  %s_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %s_ptr, i32 %idx
+  %s_load = load i32, ptr addrspace(2) %s_gep, align 4
+  store i32 %s_load, ptr %dst, align 4
+
+  ;; t[idx]
+  ;
+  ; CHECK: [[T_IDX:%.*]] = add i32 10, %idx
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 [[T_IDX]])
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4
+  ; CHECK: store i32 [[X]], ptr [[PTR]]
+  %t_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 160)
+  %t_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %t_ptr, i32 %idx
+  %t_load = load i32, ptr addrspace(2) %t_gep, align 4
+  %t.i = getelementptr inbounds nuw i8, ptr %dst, i32 4
+  store i32 %t_load, ptr %t.i, align 4
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll
new file mode 100644
index 0000000000000..65c9a3ec966e9
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll
@@ -0,0 +1,101 @@
+; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s
+
+; cbuffer CB {
+;   float a1;     // offset  0, size  4
+;   int a2;       // offset  4, size  4
+;   bool a3;      // offset  8, size  4
+;   float16_t a4; // offset 12, size  2
+;   uint16_t a5;  // offset 14, size  2
+;   double a6;    // offset 16, size  8
+;   int64_t a7;   // offset 24, size  8
+; }
+%__cblayout_CB = type <{ float, i32, i32, half, i16, double, i64 }>
+
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+
+; CHECK: define void @f
+define void @f(ptr %dst) {
+entry:
+  %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+  store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4
+
+  ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb
+  %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8
+
+  ;; a1
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0)
+  ; CHECK: [[A1:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0
+  ; CHECK: store float [[A1]], ptr %dst
+  %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0)
+  %a1 = load float, ptr addrspace(2) %a1_ptr, align 4
+  store float %a1, ptr %dst, align 8
+
+  ;; a2
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0)
+  ; CHECK: [[A2:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4
+  ; CHECK: store i32 [[A2]], ptr [[PTR]]
+  %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 4)
+  %a2 = load i32, ptr addrspace(2) %a2_ptr, align 4
+  %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 4
+  store i32 %a2, ptr %a2.i, align 8
+
+  ;; a3
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0)
+  ; CHECK: [[A3:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8
+  ; CHECK: store i32 [[A3]], ptr [[PTR]]
+  %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 8)
+  %a3 = load i32, ptr addrspace(2) %a3_ptr, align 4
+  %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 8
+  store i32 %a3, ptr %a3.i, align 4
+
+  ;; a4
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0)
+  ; CHECK: [[A4:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 6
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 12
+  ; CHECK: store half [[A4]], ptr [[PTR]]
+  %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 12)
+  %a4 = load half, ptr addrspace(2) %a4_ptr, align 2
+  %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 12
+  store half %a4, ptr %a4.i, align 4
+
+  ;; a5
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0)
+  ; CHECK: [[A5:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 7
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 14
+  ; CHECK: store i16 [[A5]], ptr [[PTR]]
+  %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 14)
+  %a5 = load i16, ptr addrspace(2) %a5_ptr, align 2
+  %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 14
+  store i16 %a5, ptr %a5.i, align 2
+
+  ;; a6
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1)
+  ; CHECK: [[A6:%.*]] = extractvalue { double, double } [[LOAD]], 0
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16
+  ; CHECK: store double [[A6]], ptr [[PTR]]
+  %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16)
+  %a6 = load double, ptr addrspace(2) %a6_ptr, align 8
+  %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 16
+  store double %a6, ptr %a6.i, align 8
+
+  ;; a7
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1)
+  ; CHECK: [[A7:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 24
+  ; CHECK: store i64 [[A7]], ptr [[PTR]]
+  %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 24)
+  %a7 = load i64, ptr addrspace(2) %a7_ptr, align 8
+  %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 24
+  store i64 %a7, ptr %a7.i, align 8
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll
new file mode 100644
index 0000000000000..0156a1a0472ab
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll
@@ -0,0 +1,121 @@
+; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s
+
+; cbuffer CB {
+;   float3 a1;     // offset   0, size 12 (+4)
+;   double3 a2;    // offset  16, size 24
+;   float16_t2 a3; // offset  40, size  4 (+4)
+;   uint64_t3 a4;  // offset  48, size 24 (+8)
+;   int4 a5;       // offset  80, size 16
+;   uint16_t3 a6;  // offset  96, size  6
+; };
+%__cblayout_CB = type <{ <3 x float>, target("dx.Padding", 4), <3 x double>, <2 x half>, target("dx.Padding", 4), <3 x i64>, target("dx.Padding", 8), <4 x i32>, <3 x i16> }>
+
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+
+; CHECK: define void @f
+define void @f(ptr %dst) {
+entry:
+  %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+  store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4
+
+  ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb
+  %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8
+
+  ;; a1
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0)
+  ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0
+  ; CHECK: [[Y:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 1
+  ; CHECK: [[Z:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 2
+  ; CHECK: [[VEC0:%.*]] = insertelement <3 x float> poison, float [[X]], i32 0
+  ; CHECK: [[VEC1:%.*]] = insertelement <3 x float> [[VEC0]], float [[Y]], i32 1
+  ; CHECK: [[VEC2:%.*]] = insertelement <3 x float> [[VEC1]], float [[Z]], i32 2
+  ; CHECK: store <3 x float> [[VEC2]], ptr %dst
+  %a1_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0)
+  %a1 = load <3 x float>, ptr addrspace(2) %a1_gep, align 16
+  store <3 x float> %a1, ptr %dst, align 4
+
+  ;; a2
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1)
+  ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0
+  ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1
+  ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2)
+  ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0
+  ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0
+  ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1
+  ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16
+  ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]]
+  %a2_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16)
+  %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 32
+  %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 16
+  store <3 x double> %a2, ptr %a2.i, align 8
+
+  ;; a3
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2)
+  ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 4
+  ; CHECK: [[Y:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 5
+  ; CHECK: [[VEC0:%.*]] = insertelement <2 x half> poison, half [[X]], i32 0
+  ; CHECK: [[VEC1:%.*]] = insertelement <2 x half> [[VEC0]], half [[Y]], i32 1
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40
+  ; CHECK: store <2 x half> [[VEC1]], ptr [[PTR]]
+  %a3_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 40)
+  %a3 = load <2 x half>, ptr addrspace(2) %a3_gep, align 4
+  %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 40
+  store <2 x half> %a3, ptr %a3.i, align 2
+
+  ;; a4
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 3)
+  ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0
+  ; CHECK: [[Y:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1
+  ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 4)
+  ; CHECK: [[Z:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0
+  ; CHECK: [[VEC0:%.*]] = insertelement <3 x i64> poison, i64 [[X]], i32 0
+  ; CHECK: [[VEC1:%.*]] = insertelement <3 x i64> [[VEC0]], i64 [[Y]], i32 1
+  ; CHECK: [[VEC2:%.*]] = insertelement <3 x i64> [[VEC1]], i64 [[Z]], i32 2
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48
+  ; CHECK: store <3 x i64> [[VEC2]], ptr [[PTR]]
+  %a4_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48)
+  %a4 = load <3 x i64>, ptr addrspace(2) %a4_gep, align 32
+  %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 48
+  store <3 x i64> %a4, ptr %a4.i, align 8
+
+  ;; a5
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0
+  ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1
+  ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2
+  ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3
+  ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+  ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1
+  ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2
+  ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72
+  ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]]
+  %a5_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 80)
+  %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 16
+  %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 72
+  store <4 x i32> %a5, ptr %a5.i, align 4
+
+  ;; a6
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6)
+  ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0
+  ; CHECK: [[Y:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 1
+  ; CHECK: [[Z:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 2
+  ; CHECK: [[VEC0:%.*]] = insertelement <3 x i16> poison, i16 [[X]], i32 0
+  ; CHECK: [[VEC1:%.*]] = insertelement <3 x i16> [[VEC0]], i16 [[Y]], i32 1
+  ; CHECK: [[VEC2:%.*]] = insertelement <3 x i16> [[VEC1]], i16 [[Z]], i32 2
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 88
+  ; CHECK: store <3 x i16> [[VEC2]], ptr [[PTR]]
+  %a6_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 96)
+  %a6 = load <3 x i16>, ptr addrspace(2) %a6_gep, align 8
+  %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 88
+  store <3 x i16> %a6, ptr %a6.i, align 2
+
+  ret void
+}

From 793ab6a80d22a2357e5fdbc1e9f51d0ed8dd5a4f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 12:34:00 -0800
Subject: [PATCH 03/97] X86: Enable terminal rule (#165957)

---
 llvm/lib/Target/X86/X86Subtarget.h            |    2 +
 llvm/test/CodeGen/X86/3addr-16bit.ll          |   48 +-
 llvm/test/CodeGen/X86/atomic-rm-bit-test.ll   |   22 +-
 .../CodeGen/X86/atomicrmw-fadd-fp-vector.ll   |    3 +-
 llvm/test/CodeGen/X86/bitcast-vector-bool.ll  |   32 +-
 .../X86/coalescer-dead-flag-verifier-error.ll |    4 +-
 llvm/test/CodeGen/X86/fold-loop-of-urem.ll    |   81 +-
 llvm/test/CodeGen/X86/freeze-binary.ll        |   26 +-
 llvm/test/CodeGen/X86/i128-mul.ll             |  178 +-
 llvm/test/CodeGen/X86/icmp-abs-C.ll           |   22 +-
 .../test/CodeGen/X86/masked_gather_scatter.ll |   12 +-
 llvm/test/CodeGen/X86/midpoint-int.ll         |   28 +-
 llvm/test/CodeGen/X86/mmx-arith.ll            |    3 +-
 llvm/test/CodeGen/X86/mul-constant-i16.ll     |    8 +-
 llvm/test/CodeGen/X86/mul-constant-i32.ll     |   16 +-
 llvm/test/CodeGen/X86/mul-constant-i8.ll      |    4 +-
 llvm/test/CodeGen/X86/optimize-max-0.ll       |  211 +-
 llvm/test/CodeGen/X86/parity.ll               |   30 +-
 llvm/test/CodeGen/X86/pr166744.ll             |   14 +-
 llvm/test/CodeGen/X86/rotate-extract.ll       |    4 +-
 llvm/test/CodeGen/X86/smul_fix.ll             |    8 +-
 llvm/test/CodeGen/X86/sshl_sat.ll             |   40 +-
 llvm/test/CodeGen/X86/sshl_sat_vec.ll         |  113 +-
 llvm/test/CodeGen/X86/stackmap.ll             |    9 +-
 .../subvectorwise-store-of-vector-splat.ll    |  210 +-
 llvm/test/CodeGen/X86/twoaddr-lea.ll          |    2 +-
 llvm/test/CodeGen/X86/umul_fix.ll             |    8 +-
 llvm/test/CodeGen/X86/ushl_sat.ll             |   28 +-
 llvm/test/CodeGen/X86/ushl_sat_vec.ll         |  111 +-
 .../CodeGen/X86/vector-mulfix-legalize.ll     |   34 +-
 .../CodeGen/X86/vector-reduce-xor-bool.ll     |  160 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 6081 ++++++++---------
 .../X86/wide-scalar-shift-legalization.ll     | 1344 ++--
 ...ad-of-small-alloca-with-zero-upper-half.ll |  328 +-
 .../CodeGen/X86/widen-load-of-small-alloca.ll |   95 +-
 llvm/test/CodeGen/X86/x86-shrink-wrapping.ll  |   18 +-
 llvm/test/CodeGen/X86/xor.ll                  |  132 +-
 .../LoopStrengthReduce/X86/ivchain-X86.ll     |   21 +-
 38 files changed, 4728 insertions(+), 4762 deletions(-)

diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 868f41375b96b..4f5aadca361fe 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -419,6 +419,8 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   /// Enable the MachineScheduler pass for all X86 subtargets.
   bool enableMachineScheduler() const override { return true; }
 
+  bool enableTerminalRule() const override { return true; }
+
   bool enableEarlyIfConversion() const override;
 
   void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
diff --git a/llvm/test/CodeGen/X86/3addr-16bit.ll b/llvm/test/CodeGen/X86/3addr-16bit.ll
index c9390d91d59c2..2b692bff0461e 100644
--- a/llvm/test/CodeGen/X86/3addr-16bit.ll
+++ b/llvm/test/CodeGen/X86/3addr-16bit.ll
@@ -10,27 +10,27 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X64-LABEL: test1:
 ; X64:       ## %bb.0: ## %entry
 ; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    incl %eax
-; X64-NEXT:    cmpw %di, %si
+; X64-NEXT:    incl %esi
+; X64-NEXT:    cmpw %di, %ax
 ; X64-NEXT:    jne LBB0_2
 ; X64-NEXT:  ## %bb.1: ## %bb
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movzwl %ax, %ebx
+; X64-NEXT:    movzwl %si, %ebx
 ; X64-NEXT:    movl %ebx, %edi
 ; X64-NEXT:    callq _foo
 ; X64-NEXT:    movl %ebx, %eax
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    retq
 ; X64-NEXT:  LBB0_2: ## %bb1
-; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    movzwl %si, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test1:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    incl %eax
 ; X86-NEXT:    cmpw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    jne LBB0_2
@@ -63,27 +63,27 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X64-LABEL: test2:
 ; X64:       ## %bb.0: ## %entry
 ; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    decl %eax
-; X64-NEXT:    cmpw %di, %si
+; X64-NEXT:    decl %esi
+; X64-NEXT:    cmpw %di, %ax
 ; X64-NEXT:    jne LBB1_2
 ; X64-NEXT:  ## %bb.1: ## %bb
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movzwl %ax, %ebx
+; X64-NEXT:    movzwl %si, %ebx
 ; X64-NEXT:    movl %ebx, %edi
 ; X64-NEXT:    callq _foo
 ; X64-NEXT:    movl %ebx, %eax
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    retq
 ; X64-NEXT:  LBB1_2: ## %bb1
-; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    movzwl %si, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test2:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    decl %eax
 ; X86-NEXT:    cmpw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    jne LBB1_2
@@ -118,27 +118,27 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X64-LABEL: test3:
 ; X64:       ## %bb.0: ## %entry
 ; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    addl $2, %eax
-; X64-NEXT:    cmpw %di, %si
+; X64-NEXT:    addl $2, %esi
+; X64-NEXT:    cmpw %di, %ax
 ; X64-NEXT:    jne LBB2_2
 ; X64-NEXT:  ## %bb.1: ## %bb
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movzwl %ax, %ebx
+; X64-NEXT:    movzwl %si, %ebx
 ; X64-NEXT:    movl %ebx, %edi
 ; X64-NEXT:    callq _foo
 ; X64-NEXT:    movl %ebx, %eax
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    retq
 ; X64-NEXT:  LBB2_2: ## %bb1
-; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    movzwl %si, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test3:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl $2, %eax
 ; X86-NEXT:    cmpw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    jne LBB2_2
@@ -171,19 +171,19 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X64-LABEL: test4:
 ; X64:       ## %bb.0: ## %entry
 ; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    cmpw %di, %si
+; X64-NEXT:    addl %edi, %esi
+; X64-NEXT:    cmpw %di, %ax
 ; X64-NEXT:    jne LBB3_2
 ; X64-NEXT:  ## %bb.1: ## %bb
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movzwl %ax, %ebx
+; X64-NEXT:    movzwl %si, %ebx
 ; X64-NEXT:    movl %ebx, %edi
 ; X64-NEXT:    callq _foo
 ; X64-NEXT:    movl %ebx, %eax
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    retq
 ; X64-NEXT:  LBB3_2: ## %bb1
-; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    movzwl %si, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test4:
@@ -191,8 +191,8 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    cmpw %cx, %dx
 ; X86-NEXT:    jne LBB3_2
diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
index b4d40fee01e41..71887e369bd18 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
@@ -2156,15 +2156,17 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no
 ; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_brz:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %ecx, %edx
 ; X64-NEXT:    andb $15, %cl
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
+; X64-NEXT:    movl $1, %esi
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shll %cl, %esi
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    .p2align 4
 ; X64-NEXT:  .LBB34_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    xorl %edx, %ecx
+; X64-NEXT:    xorl %esi, %ecx
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    lock cmpxchgw %cx, (%rdi)
 ; X64-NEXT:    # kill: def $ax killed $ax def $eax
@@ -2172,12 +2174,12 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no
 ; X64-NEXT:  # %bb.2: # %atomicrmw.end
 ; X64-NEXT:    movzwl %ax, %ecx
 ; X64-NEXT:    movw $123, %ax
-; X64-NEXT:    testl %ecx, %edx
+; X64-NEXT:    testl %ecx, %esi
 ; X64-NEXT:    je .LBB34_3
 ; X64-NEXT:  # %bb.4: # %return
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB34_3: # %if.then
-; X64-NEXT:    movzwl %si, %eax
+; X64-NEXT:    movzwl %dx, %eax
 ; X64-NEXT:    movzwl (%rdi,%rax,2), %eax
 ; X64-NEXT:    retq
 entry:
@@ -3398,10 +3400,12 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n
 ; X64-LABEL: atomic_shl1_mask01_and_16_gpr_brnz:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %ecx, %edx
 ; X64-NEXT:    andb $15, %cl
-; X64-NEXT:    movl $1, %edx
-; X64-NEXT:    shll %cl, %edx
+; X64-NEXT:    movl $1, %esi
+; X64-NEXT:    shll %cl, %esi
 ; X64-NEXT:    movl $-2, %r8d
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    roll %cl, %r8d
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    .p2align 4
@@ -3415,10 +3419,10 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n
 ; X64-NEXT:    jne .LBB52_1
 ; X64-NEXT:  # %bb.2: # %atomicrmw.end
 ; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    testl %eax, %edx
+; X64-NEXT:    testl %eax, %esi
 ; X64-NEXT:    je .LBB52_3
 ; X64-NEXT:  # %bb.4: # %if.then
-; X64-NEXT:    movzwl %si, %eax
+; X64-NEXT:    movzwl %dx, %eax
 ; X64-NEXT:    movzwl (%rdi,%rax,2), %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB52_3:
diff --git a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
index 105ee7f82ee79..e118f5dbc1534 100644
--- a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
+++ b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
@@ -46,8 +46,9 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x
 ; CHECK-NEXT:    orl %edx, %eax
 ; CHECK-NEXT:    lock cmpxchgl %ecx, (%rbx)
 ; CHECK-NEXT:    setne %cl
-; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    movl %eax, %edx
 ; CHECK-NEXT:    shrl $16, %eax
+; CHECK-NEXT:    pinsrw $0, %edx, %xmm0
 ; CHECK-NEXT:    pinsrw $0, %eax, %xmm1
 ; CHECK-NEXT:    testb %cl, %cl
 ; CHECK-NEXT:    jne .LBB0_1
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index 86d7df0c2d648..fae1ff90dd8d5 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -216,8 +216,8 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind {
 define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
 ; SSE-LABEL: bitcast_v16i8_to_v2i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pmovmskb %xmm0, %ecx
-; SSE-NEXT:    movl %ecx, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    movl %eax, %ecx
 ; SSE-NEXT:    shrl $8, %eax
 ; SSE-NEXT:    addb %cl, %al
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
@@ -225,8 +225,8 @@ define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
 ;
 ; AVX12-LABEL: bitcast_v16i8_to_v2i8:
 ; AVX12:       # %bb.0:
-; AVX12-NEXT:    vpmovmskb %xmm0, %ecx
-; AVX12-NEXT:    movl %ecx, %eax
+; AVX12-NEXT:    vpmovmskb %xmm0, %eax
+; AVX12-NEXT:    movl %eax, %ecx
 ; AVX12-NEXT:    shrl $8, %eax
 ; AVX12-NEXT:    addb %cl, %al
 ; AVX12-NEXT:    # kill: def $al killed $al killed $eax
@@ -441,8 +441,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
 ; SSE-LABEL: bitcast_v16i16_to_v2i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    packsswb %xmm1, %xmm0
-; SSE-NEXT:    pmovmskb %xmm0, %ecx
-; SSE-NEXT:    movl %ecx, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    movl %eax, %ecx
 ; SSE-NEXT:    shrl $8, %eax
 ; SSE-NEXT:    addb %cl, %al
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
@@ -452,8 +452,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
-; AVX1-NEXT:    movl %ecx, %eax
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    movl %eax, %ecx
 ; AVX1-NEXT:    shrl $8, %eax
 ; AVX1-NEXT:    addb %cl, %al
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
@@ -464,8 +464,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovmskb %xmm0, %ecx
-; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    movl %eax, %ecx
 ; AVX2-NEXT:    shrl $8, %eax
 ; AVX2-NEXT:    addb %cl, %al
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
@@ -762,8 +762,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
 ; SSE-NEXT:    packssdw %xmm3, %xmm2
 ; SSE-NEXT:    packssdw %xmm1, %xmm0
 ; SSE-NEXT:    packsswb %xmm2, %xmm0
-; SSE-NEXT:    pmovmskb %xmm0, %ecx
-; SSE-NEXT:    movl %ecx, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    movl %eax, %ecx
 ; SSE-NEXT:    shrl $8, %eax
 ; SSE-NEXT:    addb %cl, %al
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
@@ -776,8 +776,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovmskb %xmm0, %ecx
-; AVX1-NEXT:    movl %ecx, %eax
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    movl %eax, %ecx
 ; AVX1-NEXT:    shrl $8, %eax
 ; AVX1-NEXT:    addb %cl, %al
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
@@ -793,8 +793,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT:    vpmovmskb %xmm0, %ecx
-; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    movl %eax, %ecx
 ; AVX2-NEXT:    shrl $8, %eax
 ; AVX2-NEXT:    addb %cl, %al
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll
index 4d41c8406f6e0..a42a715bdc6ab 100644
--- a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll
+++ b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll
@@ -7,8 +7,8 @@
 define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0_(ptr %r) {
 ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0_:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4
@@ -68,8 +68,8 @@ _ZNK4llvm5APInt13getActiveBitsEv.exit.i.i:        ; preds = %for.body.i.i.i.i.i
 define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0__assert(ptr %r) {
 ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0__assert:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    jmp .LBB1_1
 ; CHECK-NEXT:    .p2align 4
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
index c1beb7c803b2b..c9c88f7258435 100644
--- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -1031,31 +1031,30 @@ define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind {
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    je .LBB17_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edi, %r14d
 ; CHECK-NEXT:    negl %r14d
-; CHECK-NEXT:    movl $1, %r15d
+; CHECK-NEXT:    movl $1, %ebp
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB17_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl %r15d, %eax
+; CHECK-NEXT:    movl %ebp, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divl %ebx
 ; CHECK-NEXT:    movl %edx, %edi
 ; CHECK-NEXT:    callq use.i32@PLT
-; CHECK-NEXT:    leal 1(%r14,%r15), %eax
-; CHECK-NEXT:    movl %r15d, %ecx
-; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    movl %ebp, %eax
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    leal 1(%r14,%rax), %eax
 ; CHECK-NEXT:    cmpl $1, %eax
-; CHECK-NEXT:    movl %ecx, %r15d
 ; CHECK-NEXT:    jne .LBB17_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
-; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:  .LBB17_4: # %for.cond.cleanup
 ; CHECK-NEXT:    retq
 entry:
@@ -1199,32 +1198,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_
 ; CHECK-NEXT:    cmpl $3, %edi
 ; CHECK-NEXT:    jb .LBB21_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edi, %r14d
 ; CHECK-NEXT:    orl $16, %ebx
 ; CHECK-NEXT:    negl %r14d
-; CHECK-NEXT:    movl $7, %r15d
+; CHECK-NEXT:    movl $7, %ebp
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB21_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl %r15d, %eax
+; CHECK-NEXT:    movl %ebp, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divl %ebx
 ; CHECK-NEXT:    movl %edx, %edi
 ; CHECK-NEXT:    callq use.i32@PLT
-; CHECK-NEXT:    leal 1(%r14,%r15), %eax
-; CHECK-NEXT:    movl %r15d, %ecx
-; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    movl %ebp, %eax
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    leal 1(%r14,%rax), %eax
 ; CHECK-NEXT:    cmpl $5, %eax
-; CHECK-NEXT:    movl %ecx, %r15d
 ; CHECK-NEXT:    jne .LBB21_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
-; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:  .LBB21_4: # %for.cond.cleanup
 ; CHECK-NEXT:    retq
 entry:
@@ -1251,32 +1249,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32
 ; CHECK-NEXT:    cmpl $3, %edi
 ; CHECK-NEXT:    jb .LBB22_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edi, %r14d
 ; CHECK-NEXT:    orl $16, %ebx
 ; CHECK-NEXT:    negl %r14d
-; CHECK-NEXT:    movl $7, %r15d
+; CHECK-NEXT:    movl $7, %ebp
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB22_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl %r15d, %eax
+; CHECK-NEXT:    movl %ebp, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divl %ebx
 ; CHECK-NEXT:    movl %edx, %edi
 ; CHECK-NEXT:    callq use.i32@PLT
-; CHECK-NEXT:    leal 1(%r14,%r15), %eax
-; CHECK-NEXT:    movl %r15d, %ecx
-; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    movl %ebp, %eax
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    leal 1(%r14,%rax), %eax
 ; CHECK-NEXT:    cmpl $5, %eax
-; CHECK-NEXT:    movl %ecx, %r15d
 ; CHECK-NEXT:    jne .LBB22_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
-; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:  .LBB22_4: # %for.cond.cleanup
 ; CHECK-NEXT:    retq
 entry:
@@ -1303,31 +1300,30 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem(
 ; CHECK-NEXT:    cmpl $3, %edi
 ; CHECK-NEXT:    jb .LBB23_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edi, %r14d
 ; CHECK-NEXT:    negl %r14d
-; CHECK-NEXT:    movl $7, %r15d
+; CHECK-NEXT:    movl $7, %ebp
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB23_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl %r15d, %eax
+; CHECK-NEXT:    movl %ebp, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divl %ebx
 ; CHECK-NEXT:    movl %edx, %edi
 ; CHECK-NEXT:    callq use.i32@PLT
-; CHECK-NEXT:    leal 1(%r14,%r15), %eax
-; CHECK-NEXT:    movl %r15d, %ecx
-; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    movl %ebp, %eax
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    leal 1(%r14,%rax), %eax
 ; CHECK-NEXT:    cmpl $5, %eax
-; CHECK-NEXT:    movl %ecx, %r15d
 ; CHECK-NEXT:    jne .LBB23_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
-; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:  .LBB23_4: # %for.cond.cleanup
 ; CHECK-NEXT:    retq
 entry:
@@ -1404,32 +1400,31 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3
 ; CHECK-NEXT:    cmpl %edx, %edi
 ; CHECK-NEXT:    jbe .LBB25_4
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    movl %edx, %r15d
-; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edx, %ebx
+; CHECK-NEXT:    movl %esi, %ebp
 ; CHECK-NEXT:    movl %edi, %r14d
 ; CHECK-NEXT:    negl %r14d
-; CHECK-NEXT:    addl $-2, %r15d
+; CHECK-NEXT:    addl $-2, %ebx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB25_2: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl %r15d, %eax
+; CHECK-NEXT:    movl %ebx, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    divl %ebp
 ; CHECK-NEXT:    movl %edx, %edi
 ; CHECK-NEXT:    callq use.i32@PLT
-; CHECK-NEXT:    leal 1(%r14,%r15), %eax
-; CHECK-NEXT:    movl %r15d, %ecx
-; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    incl %ebx
+; CHECK-NEXT:    leal 1(%r14,%rax), %eax
 ; CHECK-NEXT:    cmpl $-2, %eax
-; CHECK-NEXT:    movl %ecx, %r15d
 ; CHECK-NEXT:    jne .LBB25_2
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
-; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:  .LBB25_4: # %for.cond.cleanup
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index e223765eb887b..46b2571e196bb 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -490,20 +490,21 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind {
 define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind {
 ; X86-LABEL: freeze_ashr_exact_extra_use:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sarl $3, %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sarl $3, %eax
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    sarl $6, %eax
+; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_ashr_exact_extra_use:
 ; X64:       # %bb.0:
-; X64-NEXT:    sarl $3, %edi
-; X64-NEXT:    movl %edi, (%rsi)
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    sarl $3, %eax
+; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    sarl $6, %eax
+; X64-NEXT:    movl %ecx, (%rsi)
 ; X64-NEXT:    retq
   %x = ashr exact i32 %a0, 3
   %y = freeze i32 %x
@@ -604,20 +605,21 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind {
 define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind {
 ; X86-LABEL: freeze_lshr_exact_extra_use:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrl $3, %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    shrl $5, %eax
+; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_lshr_exact_extra_use:
 ; X64:       # %bb.0:
-; X64-NEXT:    shrl $3, %edi
-; X64-NEXT:    movl %edi, (%rsi)
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shrl $3, %eax
+; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    shrl $5, %eax
+; X64-NEXT:    movl %ecx, (%rsi)
 ; X64-NEXT:    retq
   %x = lshr exact i32 %a0, 3
   %y = freeze i32 %x
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index cffd88c55bb0a..477a0dce5c81c 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -111,62 +111,63 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X86-NOBMI-NEXT:    orl %ecx, %eax
 ; X86-NOBMI-NEXT:    je .LBB1_3
 ; X86-NOBMI-NEXT:  # %bb.1: # %for.body.preheader
-; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
+; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:    xorl %ebp, %ebp
 ; X86-NOBMI-NEXT:    .p2align 4
 ; X86-NOBMI-NEXT:  .LBB1_2: # %for.body
 ; X86-NOBMI-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NOBMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOBMI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax,%ecx,8), %edi
-; X86-NOBMI-NEXT:    movl 4(%eax,%ecx,8), %ebx
+; X86-NOBMI-NEXT:    movl (%eax,%edi,8), %ebp
+; X86-NOBMI-NEXT:    movl 4(%eax,%edi,8), %ebx
 ; X86-NOBMI-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOBMI-NEXT:    movl %edi, %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    mull %esi
-; X86-NOBMI-NEXT:    movl %edx, %ebp
+; X86-NOBMI-NEXT:    movl %ebp, %eax
+; X86-NOBMI-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOBMI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOBMI-NEXT:    movl %ebx, %eax
-; X86-NOBMI-NEXT:    mull %esi
-; X86-NOBMI-NEXT:    movl %edx, %ebx
-; X86-NOBMI-NEXT:    movl %eax, %esi
-; X86-NOBMI-NEXT:    addl %ebp, %esi
-; X86-NOBMI-NEXT:    adcl $0, %ebx
-; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    movl %eax, %ebx
+; X86-NOBMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NOBMI-NEXT:    adcl $0, %edx
+; X86-NOBMI-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NOBMI-NEXT:    movl %ebp, %eax
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    mull %edx
-; X86-NOBMI-NEXT:    movl %edx, %ebp
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    addl %esi, %edi
-; X86-NOBMI-NEXT:    adcl %ebx, %ebp
-; X86-NOBMI-NEXT:    setb %bl
+; X86-NOBMI-NEXT:    movl %eax, %ebp
+; X86-NOBMI-NEXT:    addl %ebx, %ebp
+; X86-NOBMI-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NOBMI-NEXT:    movl %edx, %ebx
+; X86-NOBMI-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NOBMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOBMI-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NOBMI-NEXT:    addl %ebp, %eax
-; X86-NOBMI-NEXT:    movzbl %bl, %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOBMI-NEXT:    adcl %esi, %edx
-; X86-NOBMI-NEXT:    movl %ecx, %ebx
-; X86-NOBMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NOBMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NOBMI-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NOBMI-NEXT:    adcl $0, %eax
-; X86-NOBMI-NEXT:    adcl $0, %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl %ecx, (%esi,%ebx,8)
-; X86-NOBMI-NEXT:    movl %ebx, %ecx
-; X86-NOBMI-NEXT:    movl %edi, 4(%esi,%ebx,8)
-; X86-NOBMI-NEXT:    addl $1, %ecx
-; X86-NOBMI-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-NOBMI-NEXT:    adcl $0, %edi
-; X86-NOBMI-NEXT:    movl %ecx, %esi
-; X86-NOBMI-NEXT:    xorl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NOBMI-NEXT:    xorl %ebp, %edi
-; X86-NOBMI-NEXT:    orl %esi, %edi
+; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    addl %ebx, %esi
+; X86-NOBMI-NEXT:    movl %ecx, %eax
+; X86-NOBMI-NEXT:    movzbl (%esp), %ebx # 1-byte Folded Reload
+; X86-NOBMI-NEXT:    movl %edx, %ecx
+; X86-NOBMI-NEXT:    adcl %ebx, %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NOBMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NOBMI-NEXT:    adcl %eax, %ebp
+; X86-NOBMI-NEXT:    adcl $0, %esi
+; X86-NOBMI-NEXT:    adcl $0, %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl %edx, (%eax,%edi,8)
+; X86-NOBMI-NEXT:    movl %ebp, 4(%eax,%edi,8)
+; X86-NOBMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    addl $1, %edi
+; X86-NOBMI-NEXT:    adcl $0, %ebp
+; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %eax
+; X86-NOBMI-NEXT:    movl %ebp, %edx
+; X86-NOBMI-NEXT:    xorl %ebx, %edx
+; X86-NOBMI-NEXT:    orl %eax, %edx
 ; X86-NOBMI-NEXT:    jne .LBB1_2
 ; X86-NOBMI-NEXT:  .LBB1_3: # %for.end
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
@@ -184,71 +185,66 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X86-BMI-NEXT:    pushl %ebx
 ; X86-BMI-NEXT:    pushl %edi
 ; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    subl $20, %esp
+; X86-BMI-NEXT:    subl $16, %esp
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI-NEXT:    orl %ecx, %eax
 ; X86-BMI-NEXT:    je .LBB1_3
 ; X86-BMI-NEXT:  # %bb.1: # %for.body.preheader
-; X86-BMI-NEXT:    xorl %ecx, %ecx
-; X86-BMI-NEXT:    xorl %eax, %eax
+; X86-BMI-NEXT:    xorl %esi, %esi
+; X86-BMI-NEXT:    xorl %edi, %edi
 ; X86-BMI-NEXT:    xorl %ebx, %ebx
-; X86-BMI-NEXT:    xorl %ebp, %ebp
+; X86-BMI-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-BMI-NEXT:    .p2align 4
 ; X86-BMI-NEXT:  .LBB1_2: # %for.body
 ; X86-BMI-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-BMI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-BMI-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI-NEXT:    movl (%eax,%ebx,8), %ecx
-; X86-BMI-NEXT:    movl 4(%eax,%ebx,8), %esi
-; X86-BMI-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT:    movl 4(%eax,%ebx,8), %ebp
+; X86-BMI-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-BMI-NEXT:    movl %ecx, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    mulxl %eax, %edx, %edi
+; X86-BMI-NEXT:    mulxl {{[0-9]+}}(%esp), %edx, %eax
+; X86-BMI-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-BMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT:    movl %esi, %edx
-; X86-BMI-NEXT:    mulxl %eax, %esi, %eax
-; X86-BMI-NEXT:    addl %edi, %esi
-; X86-BMI-NEXT:    adcl $0, %eax
+; X86-BMI-NEXT:    movl %ebp, %edx
+; X86-BMI-NEXT:    mulxl {{[0-9]+}}(%esp), %eax, %ebp
+; X86-BMI-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-BMI-NEXT:    adcl $0, %ebp
 ; X86-BMI-NEXT:    movl %ecx, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    mulxl %ecx, %edi, %ebp
-; X86-BMI-NEXT:    addl %esi, %edi
-; X86-BMI-NEXT:    adcl %eax, %ebp
+; X86-BMI-NEXT:    mulxl {{[0-9]+}}(%esp), %ecx, %edx
+; X86-BMI-NEXT:    addl %eax, %ecx
+; X86-BMI-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-BMI-NEXT:    movl %esi, %eax
+; X86-BMI-NEXT:    adcl %ebp, %edx
+; X86-BMI-NEXT:    movl %edx, %ebp
 ; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-BMI-NEXT:    mulxl %ecx, %ecx, %eax
+; X86-BMI-NEXT:    mulxl {{[0-9]+}}(%esp), %esi, %edi
 ; X86-BMI-NEXT:    setb %dl
-; X86-BMI-NEXT:    addl %ebp, %ecx
-; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT:    addl %ebp, %esi
 ; X86-BMI-NEXT:    movzbl %dl, %edx
-; X86-BMI-NEXT:    adcl %edx, %eax
-; X86-BMI-NEXT:    movl %eax, %edx
-; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-BMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-BMI-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
-; X86-BMI-NEXT:    adcl $0, %ecx
-; X86-BMI-NEXT:    adcl $0, %edx
-; X86-BMI-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    movl %eax, (%edx,%ebx,8)
-; X86-BMI-NEXT:    movl %edi, 4(%edx,%ebx,8)
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT:    adcl %edx, %edi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-BMI-NEXT:    addl %eax, %edx
+; X86-BMI-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
+; X86-BMI-NEXT:    adcl $0, %esi
+; X86-BMI-NEXT:    adcl $0, %edi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT:    movl %edx, (%eax,%ebx,8)
+; X86-BMI-NEXT:    movl %ecx, 4(%eax,%ebx,8)
 ; X86-BMI-NEXT:    addl $1, %ebx
-; X86-BMI-NEXT:    adcl $0, %ebp
-; X86-BMI-NEXT:    movl %ebx, %edx
-; X86-BMI-NEXT:    xorl %esi, %edx
-; X86-BMI-NEXT:    movl %ebp, %esi
-; X86-BMI-NEXT:    xorl %edi, %esi
-; X86-BMI-NEXT:    orl %edx, %esi
-; X86-BMI-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-BMI-NEXT:    adcl $0, %ecx
+; X86-BMI-NEXT:    movl %ebx, %eax
+; X86-BMI-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT:    xorl %ebp, %ecx
+; X86-BMI-NEXT:    orl %eax, %ecx
 ; X86-BMI-NEXT:    jne .LBB1_2
 ; X86-BMI-NEXT:  .LBB1_3: # %for.end
 ; X86-BMI-NEXT:    xorl %eax, %eax
 ; X86-BMI-NEXT:    xorl %edx, %edx
-; X86-BMI-NEXT:    addl $20, %esp
+; X86-BMI-NEXT:    addl $16, %esp
 ; X86-BMI-NEXT:    popl %esi
 ; X86-BMI-NEXT:    popl %edi
 ; X86-BMI-NEXT:    popl %ebx
@@ -261,11 +257,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X64-NOBMI-NEXT:    je .LBB1_3
 ; X64-NOBMI-NEXT:  # %bb.1: # %for.body.preheader
 ; X64-NOBMI-NEXT:    movq %rdx, %r8
-; X64-NOBMI-NEXT:    xorl %r10d, %r10d
+; X64-NOBMI-NEXT:    xorl %edx, %edx
 ; X64-NOBMI-NEXT:    xorl %r9d, %r9d
 ; X64-NOBMI-NEXT:    .p2align 4
 ; X64-NOBMI-NEXT:  .LBB1_2: # %for.body
 ; X64-NOBMI-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NOBMI-NEXT:    movq %rdx, %r10
 ; X64-NOBMI-NEXT:    movq %rcx, %rax
 ; X64-NOBMI-NEXT:    mulq (%r8,%r9,8)
 ; X64-NOBMI-NEXT:    addq %r10, %rax
@@ -273,7 +270,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X64-NOBMI-NEXT:    movq %rax, (%rsi,%r9,8)
 ; X64-NOBMI-NEXT:    incq %r9
 ; X64-NOBMI-NEXT:    cmpq %r9, %rdi
-; X64-NOBMI-NEXT:    movq %rdx, %r10
 ; X64-NOBMI-NEXT:    jne .LBB1_2
 ; X64-NOBMI-NEXT:  .LBB1_3: # %for.end
 ; X64-NOBMI-NEXT:    xorl %eax, %eax
@@ -285,11 +281,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X64-BMI-NEXT:    je .LBB1_3
 ; X64-BMI-NEXT:  # %bb.1: # %for.body.preheader
 ; X64-BMI-NEXT:    movq %rdx, %rax
-; X64-BMI-NEXT:    xorl %r9d, %r9d
+; X64-BMI-NEXT:    xorl %edx, %edx
 ; X64-BMI-NEXT:    xorl %r8d, %r8d
 ; X64-BMI-NEXT:    .p2align 4
 ; X64-BMI-NEXT:  .LBB1_2: # %for.body
 ; X64-BMI-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-BMI-NEXT:    movq %rdx, %r9
 ; X64-BMI-NEXT:    movq %rcx, %rdx
 ; X64-BMI-NEXT:    mulxq (%rax,%r8,8), %r10, %rdx
 ; X64-BMI-NEXT:    addq %r9, %r10
@@ -297,7 +294,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X64-BMI-NEXT:    movq %r10, (%rsi,%r8,8)
 ; X64-BMI-NEXT:    incq %r8
 ; X64-BMI-NEXT:    cmpq %r8, %rdi
-; X64-BMI-NEXT:    movq %rdx, %r9
 ; X64-BMI-NEXT:    jne .LBB1_2
 ; X64-BMI-NEXT:  .LBB1_3: # %for.end
 ; X64-BMI-NEXT:    xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll
index 53b70fa38958b..c98889b7d5cb3 100644
--- a/llvm/test/CodeGen/X86/icmp-abs-C.ll
+++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll
@@ -161,22 +161,22 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind {
 ; X86-LABEL: ne_and_with_dom_abs:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movswl %cx, %eax
-; X86-NEXT:    sarl $15, %eax
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movswl %ax, %ecx
+; X86-NEXT:    sarl $15, %ecx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    xorl $12312, %eax # imm = 0x3018
 ; X86-NEXT:    movzwl %ax, %esi
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpw $64, %cx
-; X86-NEXT:    setne %cl
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpw $64, %dx
+; X86-NEXT:    setne %dl
 ; X86-NEXT:    cmpl $2345, %esi # imm = 0x929
 ; X86-NEXT:    jae .LBB3_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movb %cl, %dl
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB3_2:
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index caec02eaa19c7..2f691e7ca8f5b 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -255,9 +255,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) {
 ; X64-KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-KNL-NEXT:    kmovw %k1, %k2
 ; X64-KNL-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
-; X64-KNL-NEXT:    vmovdqa64 %zmm1, %zmm2
-; X64-KNL-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
-; X64-KNL-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
+; X64-KNL-NEXT:    vmovdqa %ymm1, %ymm2
+; X64-KNL-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
+; X64-KNL-NEXT:    vpaddd %ymm1, %ymm2, %ymm0
 ; X64-KNL-NEXT:    retq
 ;
 ; X86-KNL-LABEL: test7:
@@ -271,9 +271,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) {
 ; X86-KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-KNL-NEXT:    kmovw %k1, %k2
 ; X86-KNL-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
-; X86-KNL-NEXT:    vmovdqa64 %zmm1, %zmm2
-; X86-KNL-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
-; X86-KNL-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
+; X86-KNL-NEXT:    vmovdqa %ymm1, %ymm2
+; X86-KNL-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
+; X86-KNL-NEXT:    vpaddd %ymm1, %ymm2, %ymm0
 ; X86-KNL-NEXT:    retl
 ;
 ; X64-SKX-LABEL: test7:
diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll
index a75d42ed0c50f..c058e37e0ce11 100644
--- a/llvm/test/CodeGen/X86/midpoint-int.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int.ll
@@ -658,9 +658,9 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    subw %dx, %ax
 ; X86-NEXT:    setle %bl
 ; X86-NEXT:    leal -1(%ebx,%ebx), %edx
@@ -710,9 +710,9 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    subw %dx, %ax
 ; X86-NEXT:    setbe %bl
 ; X86-NEXT:    leal -1(%ebx,%ebx), %edx
@@ -765,9 +765,9 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    subw %dx, %ax
 ; X86-NEXT:    setle %bl
 ; X86-NEXT:    leal -1(%ebx,%ebx), %edx
@@ -817,11 +817,11 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind {
 ; X86-LABEL: scalar_i16_signed_reg_mem:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %edx
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    subw %dx, %ax
 ; X86-NEXT:    setle %bl
 ; X86-NEXT:    leal -1(%ebx,%ebx), %edx
@@ -871,12 +871,12 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; X86-LABEL: scalar_i16_signed_mem_mem:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    movzwl (%ecx), %edx
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    subw %dx, %ax
 ; X86-NEXT:    setle %bl
 ; X86-NEXT:    leal -1(%ebx,%ebx), %edx
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index 73d459ba77026..8f97d2652bc53 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -403,11 +403,11 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    je .LBB3_1
 ; X86-NEXT:  # %bb.2: # %bb26.preheader
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB3_3: # %bb26
@@ -427,7 +427,6 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind {
 ; X86-NEXT:    jb .LBB3_3
 ; X86-NEXT:    jmp .LBB3_4
 ; X86-NEXT:  .LBB3_1:
-; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:  .LBB3_4: # %bb31
 ; X86-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll b/llvm/test/CodeGen/X86/mul-constant-i16.ll
index b1aa789e53cd7..a663f6a1dd376 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i16.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll
@@ -715,8 +715,8 @@ define i16 @test_mul_by_66(i16 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shll $6, %eax
-; X64-NEXT:    leal (%rax,%rdi,2), %eax
+; X64-NEXT:    shll $6, %edi
+; X64-NEXT:    leal (%rdi,%rax,2), %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 66
@@ -757,8 +757,8 @@ define i16 @test_mul_by_520(i16 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shll $9, %eax
-; X64-NEXT:    leal (%rax,%rdi,8), %eax
+; X64-NEXT:    shll $9, %edi
+; X64-NEXT:    leal (%rdi,%rax,8), %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 520
diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll
index 79889b9ace406..4129b44ed3ddc 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i32.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll
@@ -1155,16 +1155,16 @@ define i32 @test_mul_by_66(i32 %x) {
 ; X64-HSW:       # %bb.0:
 ; X64-HSW-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-HSW-NEXT:    movl %edi, %eax
-; X64-HSW-NEXT:    shll $6, %eax
-; X64-HSW-NEXT:    leal (%rax,%rdi,2), %eax
+; X64-HSW-NEXT:    shll $6, %edi
+; X64-HSW-NEXT:    leal (%rdi,%rax,2), %eax
 ; X64-HSW-NEXT:    retq
 ;
 ; X64-JAG-LABEL: test_mul_by_66:
 ; X64-JAG:       # %bb.0:
 ; X64-JAG-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-JAG-NEXT:    movl %edi, %eax
-; X64-JAG-NEXT:    shll $6, %eax
-; X64-JAG-NEXT:    leal (%rax,%rdi,2), %eax
+; X64-JAG-NEXT:    shll $6, %edi
+; X64-JAG-NEXT:    leal (%rdi,%rax,2), %eax
 ; X64-JAG-NEXT:    retq
 ;
 ; X86-NOOPT-LABEL: test_mul_by_66:
@@ -1241,16 +1241,16 @@ define i32 @test_mul_by_520(i32 %x) {
 ; X64-HSW:       # %bb.0:
 ; X64-HSW-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-HSW-NEXT:    movl %edi, %eax
-; X64-HSW-NEXT:    shll $9, %eax
-; X64-HSW-NEXT:    leal (%rax,%rdi,8), %eax
+; X64-HSW-NEXT:    shll $9, %edi
+; X64-HSW-NEXT:    leal (%rdi,%rax,8), %eax
 ; X64-HSW-NEXT:    retq
 ;
 ; X64-JAG-LABEL: test_mul_by_520:
 ; X64-JAG:       # %bb.0:
 ; X64-JAG-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-JAG-NEXT:    movl %edi, %eax
-; X64-JAG-NEXT:    shll $9, %eax
-; X64-JAG-NEXT:    leal (%rax,%rdi,8), %eax
+; X64-JAG-NEXT:    shll $9, %edi
+; X64-JAG-NEXT:    leal (%rdi,%rax,8), %eax
 ; X64-JAG-NEXT:    retq
 ;
 ; X86-NOOPT-LABEL: test_mul_by_520:
diff --git a/llvm/test/CodeGen/X86/mul-constant-i8.ll b/llvm/test/CodeGen/X86/mul-constant-i8.ll
index a4fa1ee8c0029..b488653655728 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i8.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i8.ll
@@ -425,8 +425,8 @@ define i8 @test_mul_by_66(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shll $6, %eax
-; X64-NEXT:    leal (%rax,%rdi,2), %eax
+; X64-NEXT:    shll $6, %edi
+; X64-NEXT:    leal (%rdi,%rax,2), %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 66
diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll
index 283c00e17f21a..b6af7e1641a9c 100644
--- a/llvm/test/CodeGen/X86/optimize-max-0.ll
+++ b/llvm/test/CodeGen/X86/optimize-max-0.ll
@@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    subl $28, %esp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    imull %ebp, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    imull %esi, %eax
 ; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
+; CHECK-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; CHECK-NEXT:    je LBB0_19
 ; CHECK-NEXT:  ## %bb.1: ## %bb10.preheader
-; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:    sarl $31, %eax
-; CHECK-NEXT:    shrl $30, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    sarl $2, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    movl %eax, %ebp
+; CHECK-NEXT:    sarl $31, %ebp
+; CHECK-NEXT:    shrl $30, %ebp
+; CHECK-NEXT:    addl %eax, %ebp
+; CHECK-NEXT:    sarl $2, %ebp
+; CHECK-NEXT:    testl %edx, %edx
 ; CHECK-NEXT:    jle LBB0_12
 ; CHECK-NEXT:  ## %bb.2: ## %bb.nph9
-; CHECK-NEXT:    testl %ebp, %ebp
+; CHECK-NEXT:    testl %esi, %esi
 ; CHECK-NEXT:    jle LBB0_12
 ; CHECK-NEXT:  ## %bb.3: ## %bb.nph9.split
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    incl %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    movl %edi, %edx
+; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_4: ## %bb6
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movzbl (%eax,%esi,2), %ebx
-; CHECK-NEXT:    movb %bl, (%edx,%esi)
-; CHECK-NEXT:    incl %esi
-; CHECK-NEXT:    cmpl %ebp, %esi
+; CHECK-NEXT:    movzbl (%eax,%edi,2), %ebx
+; CHECK-NEXT:    movb %bl, (%edx,%edi)
+; CHECK-NEXT:    incl %edi
+; CHECK-NEXT:    cmpl %esi, %edi
 ; CHECK-NEXT:    jl LBB0_4
 ; CHECK-NEXT:  ## %bb.5: ## %bb9
 ; CHECK-NEXT:    ## in Loop: Header=BB0_4 Depth=1
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    addl %ebp, %edx
-; CHECK-NEXT:    cmpl %edi, %ecx
+; CHECK-NEXT:    addl %esi, %edx
+; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    je LBB0_12
 ; CHECK-NEXT:  ## %bb.6: ## %bb7.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    jmp LBB0_4
 ; CHECK-NEXT:  LBB0_12: ## %bb18.loopexit
+; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    movl (%esp), %eax ## 4-byte Reload
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    addl %ebp, %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    jle LBB0_13
 ; CHECK-NEXT:  ## %bb.7: ## %bb.nph5
-; CHECK-NEXT:    cmpl $2, %ebp
+; CHECK-NEXT:    cmpl $2, %esi
 ; CHECK-NEXT:    jl LBB0_13
 ; CHECK-NEXT:  ## %bb.8: ## %bb.nph5.split
-; CHECK-NEXT:    movl %ebp, %edx
-; CHECK-NEXT:    shrl $31, %edx
-; CHECK-NEXT:    addl %ebp, %edx
-; CHECK-NEXT:    sarl %edx
+; CHECK-NEXT:    movl %esi, %ebp
+; CHECK-NEXT:    shrl $31, %ebp
+; CHECK-NEXT:    addl %esi, %ebp
+; CHECK-NEXT:    sarl %ebp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $31, %ecx
@@ -84,102 +84,103 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    addl $2, %esi
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    movl (%esp), %esi ## 4-byte Reload
-; CHECK-NEXT:    addl %esi, %ecx
-; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    addl $2, %edx
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl (%esp), %edx ## 4-byte Reload
+; CHECK-NEXT:    addl %edx, %ecx
 ; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_9: ## %bb13
 ; CHECK-NEXT:    ## =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ## Child Loop BB0_10 Depth 2
 ; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    addl %edx, %edi
 ; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    xorl %ebx, %ebx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_10: ## %bb14
 ; CHECK-NEXT:    ## Parent Loop BB0_9 Depth=1
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    movzbl -2(%edi,%esi,4), %ebx
-; CHECK-NEXT:    movb %bl, (%ecx,%esi)
-; CHECK-NEXT:    movzbl (%edi,%esi,4), %ebx
-; CHECK-NEXT:    movb %bl, (%eax,%esi)
-; CHECK-NEXT:    incl %esi
-; CHECK-NEXT:    cmpl %edx, %esi
+; CHECK-NEXT:    movzbl -2(%edi,%ebx,4), %edx
+; CHECK-NEXT:    movb %dl, (%ecx,%ebx)
+; CHECK-NEXT:    movzbl (%edi,%ebx,4), %edx
+; CHECK-NEXT:    movb %dl, (%eax,%ebx)
+; CHECK-NEXT:    incl %ebx
+; CHECK-NEXT:    cmpl %ebp, %ebx
 ; CHECK-NEXT:    jl LBB0_10
 ; CHECK-NEXT:  ## %bb.11: ## %bb17
 ; CHECK-NEXT:    ## in Loop: Header=BB0_9 Depth=1
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
 ; CHECK-NEXT:    incl %edi
-; CHECK-NEXT:    addl %edx, %eax
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; CHECK-NEXT:    addl $2, %esi
-; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    addl %ebp, %eax
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    addl $2, %edx
+; CHECK-NEXT:    addl %ebp, %ecx
 ; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; CHECK-NEXT:    jl LBB0_9
 ; CHECK-NEXT:  LBB0_13: ## %bb20
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    cmpl $1, %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl $1, %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; CHECK-NEXT:    je LBB0_19
 ; CHECK-NEXT:  ## %bb.14: ## %bb20
-; CHECK-NEXT:    cmpl $3, %eax
+; CHECK-NEXT:    cmpl $3, %ecx
 ; CHECK-NEXT:    jne LBB0_24
 ; CHECK-NEXT:  ## %bb.15: ## %bb22
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; CHECK-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; CHECK-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT:    testl %edx, %edx
 ; CHECK-NEXT:    jle LBB0_18
 ; CHECK-NEXT:  ## %bb.16: ## %bb.nph
-; CHECK-NEXT:    leal 15(%edi), %eax
+; CHECK-NEXT:    leal 15(%edx), %eax
 ; CHECK-NEXT:    andl $-16, %eax
 ; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    addl %ebx, %ebx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl (%esp), %esi ## 4-byte Reload
-; CHECK-NEXT:    addl %esi, %ecx
-; CHECK-NEXT:    addl %ecx, %ebx
-; CHECK-NEXT:    addl %eax, %edx
-; CHECK-NEXT:    leal 15(%ebp), %eax
+; CHECK-NEXT:    addl %ebp, %ebp
+; CHECK-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    addl %edi, %ecx
+; CHECK-NEXT:    addl %ecx, %ebp
+; CHECK-NEXT:    addl %eax, %ebx
+; CHECK-NEXT:    leal 15(%esi), %eax
 ; CHECK-NEXT:    andl $-16, %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_17: ## %bb23
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subl $4, %esp
-; CHECK-NEXT:    pushl %ebp
-; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    pushl %ebx
-; CHECK-NEXT:    movl %ebx, %esi
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %ebp, %edi
+; CHECK-NEXT:    movl %ebx, %ebp
 ; CHECK-NEXT:    movl %edx, %ebx
 ; CHECK-NEXT:    calll _memcpy
 ; CHECK-NEXT:    movl %ebx, %edx
-; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %ebp, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
 ; CHECK-NEXT:    addl $16, %esp
-; CHECK-NEXT:    addl %ebp, %ebx
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; CHECK-NEXT:    decl %edi
+; CHECK-NEXT:    addl %esi, %ebp
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT:    decl %edx
 ; CHECK-NEXT:    jne LBB0_17
 ; CHECK-NEXT:  LBB0_18: ## %bb26
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; CHECK-NEXT:    movl (%esp), %edx ## 4-byte Reload
-; CHECK-NEXT:    addl %edx, %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; CHECK-NEXT:    addl %ecx, %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    addl %esi, %edx
 ; CHECK-NEXT:    jmp LBB0_23
 ; CHECK-NEXT:  LBB0_19: ## %bb29
-; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    testl %edx, %edx
 ; CHECK-NEXT:    jle LBB0_22
 ; CHECK-NEXT:  ## %bb.20: ## %bb.nph11
-; CHECK-NEXT:    movl %edi, %esi
-; CHECK-NEXT:    leal 15(%ebp), %eax
+; CHECK-NEXT:    leal 15(%esi), %eax
 ; CHECK-NEXT:    andl $-16, %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -187,30 +188,32 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:  LBB0_21: ## %bb30
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subl $4, %esp
-; CHECK-NEXT:    pushl %ebp
-; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    movl %ebx, %ebp
 ; CHECK-NEXT:    movl %edx, %ebx
 ; CHECK-NEXT:    calll _memcpy
 ; CHECK-NEXT:    movl %ebx, %edx
+; CHECK-NEXT:    movl %ebp, %ebx
 ; CHECK-NEXT:    addl $16, %esp
-; CHECK-NEXT:    addl %ebp, %edi
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; CHECK-NEXT:    decl %esi
+; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT:    decl %edx
 ; CHECK-NEXT:    jne LBB0_21
 ; CHECK-NEXT:  LBB0_22: ## %bb33
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl (%esp), %edx ## 4-byte Reload
-; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    addl %ecx, %edx
 ; CHECK-NEXT:  LBB0_23: ## %bb33
-; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    shrl $31, %eax
-; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    sarl %eax
 ; CHECK-NEXT:    subl $4, %esp
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    pushl $128
-; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    pushl %edx
 ; CHECK-NEXT:    calll _memset
 ; CHECK-NEXT:    addl $44, %esp
 ; CHECK-NEXT:  LBB0_25: ## %return
@@ -523,38 +526,38 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
 ; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB1_9: ## %bb13
 ; CHECK-NEXT:    ## =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ## Child Loop BB1_10 Depth 2
-; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    andl $1, %ebx
 ; CHECK-NEXT:    movl %edx, (%esp) ## 4-byte Spill
-; CHECK-NEXT:    addl %edx, %ebx
-; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    addl %esi, %edx
+; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB1_10: ## %bb14
 ; CHECK-NEXT:    ## Parent Loop BB1_9 Depth=1
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    movzbl -2(%ebx,%esi,4), %edx
-; CHECK-NEXT:    movb %dl, (%eax,%esi)
-; CHECK-NEXT:    movzbl (%ebx,%esi,4), %edx
-; CHECK-NEXT:    movb %dl, (%ecx,%esi)
+; CHECK-NEXT:    movzbl -2(%edx,%esi,4), %ebx
+; CHECK-NEXT:    movb %bl, (%eax,%esi)
+; CHECK-NEXT:    movzbl (%edx,%esi,4), %ebx
+; CHECK-NEXT:    movb %bl, (%ecx,%esi)
 ; CHECK-NEXT:    incl %esi
 ; CHECK-NEXT:    cmpl %ebp, %esi
 ; CHECK-NEXT:    jb LBB1_10
 ; CHECK-NEXT:  ## %bb.11: ## %bb17
 ; CHECK-NEXT:    ## in Loop: Header=BB1_9 Depth=1
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; CHECK-NEXT:    incl %ebx
-; CHECK-NEXT:    addl %ebp, %ecx
 ; CHECK-NEXT:    movl (%esp), %edx ## 4-byte Reload
-; CHECK-NEXT:    addl $2, %edx
+; CHECK-NEXT:    incl %edx
+; CHECK-NEXT:    addl %ebp, %ecx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; CHECK-NEXT:    addl $2, %esi
 ; CHECK-NEXT:    addl %ebp, %eax
-; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; CHECK-NEXT:    jb LBB1_9
 ; CHECK-NEXT:  LBB1_13: ## %bb20
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll
index 420f5ba5ab433..31a7f1125150b 100644
--- a/llvm/test/CodeGen/X86/parity.ll
+++ b/llvm/test/CodeGen/X86/parity.ll
@@ -219,12 +219,12 @@ define i64 @parity_64(i64 %x) {
 ;
 ; X64-NOPOPCNT-LABEL: parity_64:
 ; X64-NOPOPCNT:       # %bb.0:
-; X64-NOPOPCNT-NEXT:    movq %rdi, %rax
-; X64-NOPOPCNT-NEXT:    shrq $32, %rax
-; X64-NOPOPCNT-NEXT:    xorl %edi, %eax
-; X64-NOPOPCNT-NEXT:    movl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    movl %edi, %eax
+; X64-NOPOPCNT-NEXT:    shrq $32, %rdi
+; X64-NOPOPCNT-NEXT:    xorl %eax, %edi
+; X64-NOPOPCNT-NEXT:    movl %edi, %ecx
 ; X64-NOPOPCNT-NEXT:    shrl $16, %ecx
-; X64-NOPOPCNT-NEXT:    xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    xorl %edi, %ecx
 ; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
 ; X64-NOPOPCNT-NEXT:    xorb %ch, %cl
 ; X64-NOPOPCNT-NEXT:    setnp %al
@@ -264,12 +264,12 @@ define i32 @parity_64_trunc(i64 %x) {
 ;
 ; X64-NOPOPCNT-LABEL: parity_64_trunc:
 ; X64-NOPOPCNT:       # %bb.0:
-; X64-NOPOPCNT-NEXT:    movq %rdi, %rax
-; X64-NOPOPCNT-NEXT:    shrq $32, %rax
-; X64-NOPOPCNT-NEXT:    xorl %edi, %eax
-; X64-NOPOPCNT-NEXT:    movl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    movl %edi, %eax
+; X64-NOPOPCNT-NEXT:    shrq $32, %rdi
+; X64-NOPOPCNT-NEXT:    xorl %eax, %edi
+; X64-NOPOPCNT-NEXT:    movl %edi, %ecx
 ; X64-NOPOPCNT-NEXT:    shrl $16, %ecx
-; X64-NOPOPCNT-NEXT:    xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    xorl %edi, %ecx
 ; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
 ; X64-NOPOPCNT-NEXT:    xorb %ch, %cl
 ; X64-NOPOPCNT-NEXT:    setnp %al
@@ -628,12 +628,12 @@ define i64 @parity_64_shift(i64 %0) {
 ;
 ; X64-NOPOPCNT-LABEL: parity_64_shift:
 ; X64-NOPOPCNT:       # %bb.0:
-; X64-NOPOPCNT-NEXT:    movq %rdi, %rax
-; X64-NOPOPCNT-NEXT:    shrq $32, %rax
-; X64-NOPOPCNT-NEXT:    xorl %edi, %eax
-; X64-NOPOPCNT-NEXT:    movl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    movl %edi, %eax
+; X64-NOPOPCNT-NEXT:    shrq $32, %rdi
+; X64-NOPOPCNT-NEXT:    xorl %eax, %edi
+; X64-NOPOPCNT-NEXT:    movl %edi, %ecx
 ; X64-NOPOPCNT-NEXT:    shrl $16, %ecx
-; X64-NOPOPCNT-NEXT:    xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT:    xorl %edi, %ecx
 ; X64-NOPOPCNT-NEXT:    xorl %eax, %eax
 ; X64-NOPOPCNT-NEXT:    xorb %ch, %cl
 ; X64-NOPOPCNT-NEXT:    setnp %al
diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll
index 21b25d87796a5..ffdb68c7a6c01 100644
--- a/llvm/test/CodeGen/X86/pr166744.ll
+++ b/llvm/test/CodeGen/X86/pr166744.ll
@@ -31,13 +31,13 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
 ; NOPOSTRA-LABEL: PR166744:
 ; NOPOSTRA:       # %bb.0:
 ; NOPOSTRA-NEXT:    movl %esi, %eax
-; NOPOSTRA-NEXT:    shrl $3, %eax
-; NOPOSTRA-NEXT:    andl $60, %eax
-; NOPOSTRA-NEXT:    movl (%rdi,%rax), %ecx
-; NOPOSTRA-NEXT:    btrl %esi, %ecx
-; NOPOSTRA-NEXT:    shlxl %esi, %edx, %edx
-; NOPOSTRA-NEXT:    orl %ecx, %edx
-; NOPOSTRA-NEXT:    movl %edx, (%rdi,%rax)
+; NOPOSTRA-NEXT:    shrl $3, %esi
+; NOPOSTRA-NEXT:    andl $60, %esi
+; NOPOSTRA-NEXT:    movl (%rdi,%rsi), %ecx
+; NOPOSTRA-NEXT:    btrl %eax, %ecx
+; NOPOSTRA-NEXT:    shlxl %eax, %edx, %eax
+; NOPOSTRA-NEXT:    orl %ecx, %eax
+; NOPOSTRA-NEXT:    movl %eax, (%rdi,%rsi)
 ; NOPOSTRA-NEXT:    movq 16(%rdi), %rax
 ; NOPOSTRA-NEXT:    movq (%rdi), %rcx
 ; NOPOSTRA-NEXT:    movq 8(%rdi), %rdx
diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll
index 8f046a4f5aea5..26e68861cf45c 100644
--- a/llvm/test/CodeGen/X86/rotate-extract.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract.ll
@@ -203,10 +203,10 @@ define i16 @no_extract_mul(i16 %i) nounwind {
 ; X64-LABEL: no_extract_mul:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    leal (%rdi,%rdi,8), %eax
-; X64-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shll $8, %edi
 ; X64-NEXT:    leal (%rdi,%rdi,8), %ecx
+; X64-NEXT:    leal (%rax,%rax,8), %eax
 ; X64-NEXT:    movzwl %ax, %eax
 ; X64-NEXT:    shrl $9, %eax
 ; X64-NEXT:    orl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index ce56283df6010..8cb032776114b 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -10,10 +10,10 @@ declare  <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32)
 define i32 @func(i32 %x, i32 %y) nounwind {
 ; X64-LABEL: func:
 ; X64:       # %bb.0:
-; X64-NEXT:    movslq %esi, %rax
-; X64-NEXT:    movslq %edi, %rcx
-; X64-NEXT:    imulq %rax, %rcx
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movslq %esi, %rcx
+; X64-NEXT:    movslq %edi, %rax
+; X64-NEXT:    imulq %rcx, %rax
+; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    shrq $32, %rax
 ; X64-NEXT:    shldl $30, %ecx, %eax
 ; X64-NEXT:    # kill: def $eax killed $eax killed $rax
diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll
index e5ea911d4771a..a93be22bf5861 100644
--- a/llvm/test/CodeGen/X86/sshl_sat.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat.ll
@@ -15,16 +15,16 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
 ; X64-NEXT:    movl %edi, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movswl %dx, %esi
+; X64-NEXT:    shll %cl, %edi
+; X64-NEXT:    movswl %di, %esi
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    sarl %cl, %esi
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    testw %di, %di
+; X64-NEXT:    testw %dx, %dx
 ; X64-NEXT:    sets %al
 ; X64-NEXT:    addl $32767, %eax # imm = 0x7FFF
-; X64-NEXT:    cmpw %si, %di
-; X64-NEXT:    cmovel %edx, %eax
+; X64-NEXT:    cmpw %si, %dx
+; X64-NEXT:    cmovel %edi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 ;
@@ -33,17 +33,17 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movswl %si, %edi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movswl %dx, %edi
 ; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testw %dx, %dx
+; X86-NEXT:    testw %si, %si
 ; X86-NEXT:    sets %al
 ; X86-NEXT:    addl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    cmpw %di, %dx
-; X86-NEXT:    cmovel %esi, %eax
+; X86-NEXT:    cmpw %di, %si
+; X86-NEXT:    cmovel %edx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -58,18 +58,18 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X64-NEXT:    movl %esi, %ecx
 ; X64-NEXT:    movsbl %dil, %eax
 ; X64-NEXT:    addl %eax, %eax
-; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    movl %eax, %edx
+; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    testw %ax, %ax
-; X64-NEXT:    sets %dl
-; X64-NEXT:    addl $32767, %edx # imm = 0x7FFF
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    movswl %si, %edi
+; X64-NEXT:    sets %sil
+; X64-NEXT:    addl $32767, %esi # imm = 0x7FFF
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    movswl %ax, %edi
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    sarl %cl, %edi
-; X64-NEXT:    cmpw %di, %ax
-; X64-NEXT:    cmovnel %edx, %esi
-; X64-NEXT:    movswl %si, %eax
+; X64-NEXT:    cmpw %di, %dx
+; X64-NEXT:    cmovnel %esi, %eax
+; X64-NEXT:    cwtl
 ; X64-NEXT:    shrl %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
index 10dee14bdd1a0..ff76707bdbb69 100644
--- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
@@ -365,119 +365,118 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    shll %cl, %ebx
-; X86-NEXT:    movswl %bx, %ebp
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movswl %di, %ebp
 ; X86-NEXT:    sarl %cl, %ebp
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testw %di, %di
+; X86-NEXT:    testw %bx, %bx
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    cmpw %bp, %di
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmovel %ebx, %ecx
+; X86-NEXT:    cmpw %bp, %bx
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    cmovel %edi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movswl %di, %ebx
-; X86-NEXT:    sarl %cl, %ebx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testw %si, %si
-; X86-NEXT:    sets %al
-; X86-NEXT:    addl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    cmpw %bx, %si
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovel %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movswl %si, %edi
 ; X86-NEXT:    sarl %cl, %edi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testw %dx, %dx
-; X86-NEXT:    sets %al
-; X86-NEXT:    addl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    cmpw %di, %dx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testw %bx, %bx
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    cmpw %di, %bx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovel %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    cmovel %esi, %ebp
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movswl %dx, %esi
 ; X86-NEXT:    sarl %cl, %esi
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    testw %di, %di
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    addl $32767, %ebx # imm = 0x7FFF
-; X86-NEXT:    cmpw %si, %ax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpw %si, %di
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmovel %edx, %ebx
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movswl %ax, %edx
+; X86-NEXT:    sarl %cl, %edx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testw %si, %si
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    cmpw %dx, %si
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    cmovel %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movswl %dx, %esi
-; X86-NEXT:    sarl %cl, %esi
+; X86-NEXT:    movswl %dx, %eax
+; X86-NEXT:    sarl %cl, %eax
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    testw %si, %si
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    cmpw %si, %ax
+; X86-NEXT:    cmpw %ax, %si
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmovel %edx, %ecx
-; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movswl %dx, %esi
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movswl %ax, %esi
 ; X86-NEXT:    sarl %cl, %esi
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    testw %dx, %dx
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    addl $32767, %ebx # imm = 0x7FFF
-; X86-NEXT:    cmpw %si, %ax
-; X86-NEXT:    cmovel %edx, %ebx
+; X86-NEXT:    cmpw %si, %dx
+; X86-NEXT:    cmovel %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movswl %si, %edi
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movswl %ax, %edi
 ; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    testw %si, %si
 ; X86-NEXT:    sets %dl
 ; X86-NEXT:    addl $32767, %edx # imm = 0x7FFF
-; X86-NEXT:    cmpw %di, %ax
-; X86-NEXT:    cmovel %esi, %edx
+; X86-NEXT:    cmpw %di, %si
+; X86-NEXT:    cmovel %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movswl %si, %edi
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movswl %ax, %edi
 ; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    testw %si, %si
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    cmpw %di, %ax
-; X86-NEXT:    cmovel %esi, %ecx
+; X86-NEXT:    cmpw %di, %si
+; X86-NEXT:    cmovel %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movw %cx, 14(%eax)
 ; X86-NEXT:    movw %dx, 12(%eax)
 ; X86-NEXT:    movw %bx, 10(%eax)
-; X86-NEXT:    movw %bp, 8(%eax)
 ; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movw %cx, 8(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movw %cx, 6(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movw %cx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movw %cx, 2(%eax)
+; X86-NEXT:    movw %bp, 2(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movw %cx, (%eax)
 ; X86-NEXT:    addl $16, %esp
diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll
index 72406aaa4efa8..9bf88cb8bdf81 100644
--- a/llvm/test/CodeGen/X86/stackmap.ll
+++ b/llvm/test/CodeGen/X86/stackmap.ll
@@ -1,7 +1,10 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -terminal-rule=0 | FileCheck %s
 ;
 ; Note: Print verbose stackmaps using -debug-only=stackmaps.
 
+; FIXME: Test should be fixed to produce the correct sized spill with
+; -terminal-rule=0 flag removed
+
 ; CHECK-LABEL:  .section  __LLVM_STACKMAPS,__llvm_stackmaps
 ; CHECK-NEXT:  __LLVM_StackMaps:
 ; Header
@@ -546,8 +549,8 @@ define void @clobberScratch(i32 %a) {
   ret void
 }
 
-; A stack frame which needs to be realigned at runtime (to meet alignment 
-; criteria for values on the stack) does not have a fixed frame size. 
+; A stack frame which needs to be realigned at runtime (to meet alignment
+; criteria for values on the stack) does not have a fixed frame size.
 ; CHECK-LABEL:  .long L{{.*}}-_needsStackRealignment
 ; CHECK-NEXT:   .short 0
 ; 0 locations
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index 5bd624c0697a0..01fbafb18eb9f 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -2429,126 +2429,126 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
 ; SSE2-ONLY:       # %bb.0:
 ; SSE2-ONLY-NEXT:    movl (%rdi), %eax
 ; SSE2-ONLY-NEXT:    notl %eax
-; SSE2-ONLY-NEXT:    movw %ax, (%rsi)
 ; SSE2-ONLY-NEXT:    movl %eax, %ecx
-; SSE2-ONLY-NEXT:    shrl $16, %ecx
-; SSE2-ONLY-NEXT:    movb %cl, 2(%rsi)
-; SSE2-ONLY-NEXT:    movb %cl, 2(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, (%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 6(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 4(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 10(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 8(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 14(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 12(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 18(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 16(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 22(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 20(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 26(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 24(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 30(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 28(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 34(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 32(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 38(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 36(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 42(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 40(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 46(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 44(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 50(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 48(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 54(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 52(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 58(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 56(%rdx)
-; SSE2-ONLY-NEXT:    movb %cl, 62(%rdx)
-; SSE2-ONLY-NEXT:    movw %ax, 60(%rdx)
+; SSE2-ONLY-NEXT:    movw %ax, (%rsi)
+; SSE2-ONLY-NEXT:    shrl $16, %eax
+; SSE2-ONLY-NEXT:    movb %al, 2(%rsi)
+; SSE2-ONLY-NEXT:    movb %al, 2(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, (%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 6(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 4(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 10(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 8(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 14(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 12(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 18(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 16(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 22(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 20(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 26(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 24(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 30(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 28(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 34(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 32(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 38(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 36(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 42(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 40(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 46(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 44(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 50(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 48(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 54(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 52(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 58(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 56(%rdx)
+; SSE2-ONLY-NEXT:    movb %al, 62(%rdx)
+; SSE2-ONLY-NEXT:    movw %cx, 60(%rdx)
 ; SSE2-ONLY-NEXT:    retq
 ;
 ; SSE3-LABEL: vec384_v3i8:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movl (%rdi), %eax
 ; SSE3-NEXT:    notl %eax
-; SSE3-NEXT:    movw %ax, (%rsi)
 ; SSE3-NEXT:    movl %eax, %ecx
-; SSE3-NEXT:    shrl $16, %ecx
-; SSE3-NEXT:    movb %cl, 2(%rsi)
-; SSE3-NEXT:    movb %cl, 2(%rdx)
-; SSE3-NEXT:    movw %ax, (%rdx)
-; SSE3-NEXT:    movb %cl, 6(%rdx)
-; SSE3-NEXT:    movw %ax, 4(%rdx)
-; SSE3-NEXT:    movb %cl, 10(%rdx)
-; SSE3-NEXT:    movw %ax, 8(%rdx)
-; SSE3-NEXT:    movb %cl, 14(%rdx)
-; SSE3-NEXT:    movw %ax, 12(%rdx)
-; SSE3-NEXT:    movb %cl, 18(%rdx)
-; SSE3-NEXT:    movw %ax, 16(%rdx)
-; SSE3-NEXT:    movb %cl, 22(%rdx)
-; SSE3-NEXT:    movw %ax, 20(%rdx)
-; SSE3-NEXT:    movb %cl, 26(%rdx)
-; SSE3-NEXT:    movw %ax, 24(%rdx)
-; SSE3-NEXT:    movb %cl, 30(%rdx)
-; SSE3-NEXT:    movw %ax, 28(%rdx)
-; SSE3-NEXT:    movb %cl, 34(%rdx)
-; SSE3-NEXT:    movw %ax, 32(%rdx)
-; SSE3-NEXT:    movb %cl, 38(%rdx)
-; SSE3-NEXT:    movw %ax, 36(%rdx)
-; SSE3-NEXT:    movb %cl, 42(%rdx)
-; SSE3-NEXT:    movw %ax, 40(%rdx)
-; SSE3-NEXT:    movb %cl, 46(%rdx)
-; SSE3-NEXT:    movw %ax, 44(%rdx)
-; SSE3-NEXT:    movb %cl, 50(%rdx)
-; SSE3-NEXT:    movw %ax, 48(%rdx)
-; SSE3-NEXT:    movb %cl, 54(%rdx)
-; SSE3-NEXT:    movw %ax, 52(%rdx)
-; SSE3-NEXT:    movb %cl, 58(%rdx)
-; SSE3-NEXT:    movw %ax, 56(%rdx)
-; SSE3-NEXT:    movb %cl, 62(%rdx)
-; SSE3-NEXT:    movw %ax, 60(%rdx)
+; SSE3-NEXT:    movw %ax, (%rsi)
+; SSE3-NEXT:    shrl $16, %eax
+; SSE3-NEXT:    movb %al, 2(%rsi)
+; SSE3-NEXT:    movb %al, 2(%rdx)
+; SSE3-NEXT:    movw %cx, (%rdx)
+; SSE3-NEXT:    movb %al, 6(%rdx)
+; SSE3-NEXT:    movw %cx, 4(%rdx)
+; SSE3-NEXT:    movb %al, 10(%rdx)
+; SSE3-NEXT:    movw %cx, 8(%rdx)
+; SSE3-NEXT:    movb %al, 14(%rdx)
+; SSE3-NEXT:    movw %cx, 12(%rdx)
+; SSE3-NEXT:    movb %al, 18(%rdx)
+; SSE3-NEXT:    movw %cx, 16(%rdx)
+; SSE3-NEXT:    movb %al, 22(%rdx)
+; SSE3-NEXT:    movw %cx, 20(%rdx)
+; SSE3-NEXT:    movb %al, 26(%rdx)
+; SSE3-NEXT:    movw %cx, 24(%rdx)
+; SSE3-NEXT:    movb %al, 30(%rdx)
+; SSE3-NEXT:    movw %cx, 28(%rdx)
+; SSE3-NEXT:    movb %al, 34(%rdx)
+; SSE3-NEXT:    movw %cx, 32(%rdx)
+; SSE3-NEXT:    movb %al, 38(%rdx)
+; SSE3-NEXT:    movw %cx, 36(%rdx)
+; SSE3-NEXT:    movb %al, 42(%rdx)
+; SSE3-NEXT:    movw %cx, 40(%rdx)
+; SSE3-NEXT:    movb %al, 46(%rdx)
+; SSE3-NEXT:    movw %cx, 44(%rdx)
+; SSE3-NEXT:    movb %al, 50(%rdx)
+; SSE3-NEXT:    movw %cx, 48(%rdx)
+; SSE3-NEXT:    movb %al, 54(%rdx)
+; SSE3-NEXT:    movw %cx, 52(%rdx)
+; SSE3-NEXT:    movb %al, 58(%rdx)
+; SSE3-NEXT:    movw %cx, 56(%rdx)
+; SSE3-NEXT:    movb %al, 62(%rdx)
+; SSE3-NEXT:    movw %cx, 60(%rdx)
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-ONLY-LABEL: vec384_v3i8:
 ; SSSE3-ONLY:       # %bb.0:
 ; SSSE3-ONLY-NEXT:    movl (%rdi), %eax
 ; SSSE3-ONLY-NEXT:    notl %eax
-; SSSE3-ONLY-NEXT:    movw %ax, (%rsi)
 ; SSSE3-ONLY-NEXT:    movl %eax, %ecx
-; SSSE3-ONLY-NEXT:    shrl $16, %ecx
-; SSSE3-ONLY-NEXT:    movb %cl, 2(%rsi)
-; SSSE3-ONLY-NEXT:    movb %cl, 2(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, (%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 6(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 4(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 10(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 8(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 14(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 12(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 18(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 16(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 22(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 20(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 26(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 24(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 30(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 28(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 34(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 32(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 38(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 36(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 42(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 40(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 46(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 44(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 50(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 48(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 54(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 52(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 58(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 56(%rdx)
-; SSSE3-ONLY-NEXT:    movb %cl, 62(%rdx)
-; SSSE3-ONLY-NEXT:    movw %ax, 60(%rdx)
+; SSSE3-ONLY-NEXT:    movw %ax, (%rsi)
+; SSSE3-ONLY-NEXT:    shrl $16, %eax
+; SSSE3-ONLY-NEXT:    movb %al, 2(%rsi)
+; SSSE3-ONLY-NEXT:    movb %al, 2(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, (%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 6(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 4(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 10(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 8(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 14(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 12(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 18(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 16(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 22(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 20(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 26(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 24(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 30(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 28(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 34(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 32(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 38(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 36(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 42(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 40(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 46(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 44(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 50(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 48(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 54(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 52(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 58(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 56(%rdx)
+; SSSE3-ONLY-NEXT:    movb %al, 62(%rdx)
+; SSSE3-ONLY-NEXT:    movw %cx, 60(%rdx)
 ; SSSE3-ONLY-NEXT:    retq
 ;
 ; SSE41-LABEL: vec384_v3i8:
diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll
index f20b777531c5a..3ad3e9a0e7655 100644
--- a/llvm/test/CodeGen/X86/twoaddr-lea.ll
+++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll
@@ -65,10 +65,10 @@ entry:
 define void @ham() {
 ; CHECK-LABEL: ham:
 ; CHECK:       ## %bb.0: ## %bb
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    movq _global@GOTPCREL(%rip), %rdx
 ; CHECK-NEXT:    movq _global2@GOTPCREL(%rip), %rsi
-; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %cl, %cl
 ; CHECK-NEXT:    je LBB3_2
 ; CHECK-NEXT:    .p2align 4
diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index eacc714b49a4d..5a68484596a2f 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -10,10 +10,10 @@ declare  <4 x i32> @llvm.umul.fix.v4i32(<4 x i32>, <4 x i32>, i32)
 define i32 @func(i32 %x, i32 %y) nounwind {
 ; X64-LABEL: func:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    imulq %rax, %rcx
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    imulq %rcx, %rax
+; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    shrq $32, %rax
 ; X64-NEXT:    shldl $30, %ecx, %eax
 ; X64-NEXT:    # kill: def $eax killed $eax killed $rax
diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll
index e0e1ef7108d0d..9768e4761f47a 100644
--- a/llvm/test/CodeGen/X86/ushl_sat.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat.ll
@@ -14,23 +14,23 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ; X64-LABEL: func:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movzwl %dx, %eax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll %cl, %edi
+; X64-NEXT:    movzwl %di, %edx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrl %cl, %eax
-; X64-NEXT:    cmpw %ax, %di
+; X64-NEXT:    shrl %cl, %edx
+; X64-NEXT:    cmpw %dx, %ax
 ; X64-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X64-NEXT:    cmovel %edx, %eax
+; X64-NEXT:    cmovel %edi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: func:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movzwl %dx, %esi
 ; X86-NEXT:    shrl %cl, %esi
@@ -51,14 +51,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X64-NEXT:    movsbl %dil, %eax
 ; X64-NEXT:    addl %eax, %eax
 ; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    shll %cl, %edx
-; X64-NEXT:    movzwl %dx, %esi
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    movzwl %ax, %esi
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %esi
-; X64-NEXT:    cmpw %si, %ax
-; X64-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X64-NEXT:    cmovel %edx, %eax
-; X64-NEXT:    cwtl
+; X64-NEXT:    cmpw %si, %dx
+; X64-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; X64-NEXT:    cmovel %eax, %ecx
+; X64-NEXT:    movswl %cx, %eax
 ; X64-NEXT:    shrl %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
index b8e83da9cf361..762088cfb2935 100644
--- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
@@ -300,95 +300,94 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    shll %cl, %ebx
-; X86-NEXT:    movzwl %bx, %edi
-; X86-NEXT:    shrl %cl, %edi
-; X86-NEXT:    cmpw %di, %ax
-; X86-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X86-NEXT:    cmovnel %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movzwl %ax, %edi
-; X86-NEXT:    shrl %cl, %edi
-; X86-NEXT:    cmpw %di, %si
+; X86-NEXT:    movzwl %ax, %esi
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    cmpw %si, %dx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $65535, %esi # imm = 0xFFFF
-; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    movl $65535, %edx # imm = 0xFFFF
+; X86-NEXT:    cmovnel %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movzwl %ax, %edx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    cmpw %dx, %bp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovnel %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    shll %cl, %ebp
-; X86-NEXT:    movzwl %bp, %edx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    cmpw %dx, %si
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzwl %bp, %eax
+; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    cmpw %ax, %di
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovnel %eax, %ebp
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    cmovnel %edx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shll %cl, %ebx
-; X86-NEXT:    movzwl %bx, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzwl %bx, %edx
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    cmpw %dx, %ax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $65535, %esi # imm = 0xFFFF
 ; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movzwl %di, %edx
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    cmpw %dx, %ax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    cmovnel %esi, %edi
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    shll %cl, %ebp
+; X86-NEXT:    movzwl %bp, %edx
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    cmpw %dx, %ax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmovnel %esi, %ebp
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movzwl %di, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    cmpw %ax, %dx
+; X86-NEXT:    movzwl %di, %edx
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    cmpw %dx, %ax
 ; X86-NEXT:    cmovnel %esi, %edi
+; X86-NEXT:    movl $65535, %ebx # imm = 0xFFFF
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movzwl %si, %eax
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movzwl %dx, %eax
 ; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    cmpw %ax, %dx
-; X86-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X86-NEXT:    cmovnel %eax, %esi
+; X86-NEXT:    cmpw %ax, %si
+; X86-NEXT:    cmovnel %ebx, %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movzwl %ax, %edx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpw %dx, %cx
+; X86-NEXT:    movzwl %ax, %esi
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    cmpw %si, %bx
 ; X86-NEXT:    movl $65535, %ecx # imm = 0xFFFF
 ; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movw %ax, 14(%ecx)
-; X86-NEXT:    movw %si, 12(%ecx)
+; X86-NEXT:    movw %dx, 12(%ecx)
 ; X86-NEXT:    movw %di, 10(%ecx)
-; X86-NEXT:    movw %bx, 8(%ecx)
-; X86-NEXT:    movw %bp, 6(%ecx)
+; X86-NEXT:    movw %bp, 8(%ecx)
 ; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movw %ax, 6(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movw %ax, 4(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movw %ax, 2(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movw %ax, (%ecx)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
index b233855029c58..324fe12de9400 100644
--- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
+++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -85,14 +85,14 @@ define <4 x i16> @smulfixsat(<4 x i16> %a) {
 ; CHECK-NEXT:    movswl %dx, %edx
 ; CHECK-NEXT:    leal (,%rdx,4), %esi
 ; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:    shrl $16, %edi
-; CHECK-NEXT:    shldw $1, %si, %di
+; CHECK-NEXT:    shrl $16, %esi
+; CHECK-NEXT:    shldw $1, %di, %si
 ; CHECK-NEXT:    sarl $14, %edx
 ; CHECK-NEXT:    cmpl $16384, %edx # imm = 0x4000
-; CHECK-NEXT:    cmovgel %eax, %edi
+; CHECK-NEXT:    cmovgel %eax, %esi
 ; CHECK-NEXT:    cmpl $-16384, %edx # imm = 0xC000
-; CHECK-NEXT:    cmovll %ecx, %edi
-; CHECK-NEXT:    pinsrw $3, %edi, %xmm1
+; CHECK-NEXT:    cmovll %ecx, %esi
+; CHECK-NEXT:    pinsrw $3, %esi, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %t = call <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15)
@@ -106,19 +106,19 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) {
 ; CHECK-NEXT:    pextrw $2, %xmm0, %eax
 ; CHECK-NEXT:    leal (%rax,%rax,2), %eax
 ; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    shrl $16, %edx
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shldw $1, %ax, %cx
-; CHECK-NEXT:    cmpl $32768, %edx # imm = 0x8000
+; CHECK-NEXT:    shrl $16, %eax
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    shldw $1, %dx, %cx
+; CHECK-NEXT:    cmpl $32768, %eax # imm = 0x8000
 ; CHECK-NEXT:    movl $65535, %eax # imm = 0xFFFF
 ; CHECK-NEXT:    cmovael %eax, %ecx
 ; CHECK-NEXT:    pextrw $1, %xmm0, %edx
 ; CHECK-NEXT:    addl %edx, %edx
 ; CHECK-NEXT:    movl %edx, %esi
-; CHECK-NEXT:    shrl $16, %esi
-; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:    shldw $1, %dx, %di
-; CHECK-NEXT:    cmpl $32768, %esi # imm = 0x8000
+; CHECK-NEXT:    shrl $16, %edx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    shldw $1, %si, %di
+; CHECK-NEXT:    cmpl $32768, %edx # imm = 0x8000
 ; CHECK-NEXT:    cmovael %eax, %edi
 ; CHECK-NEXT:    movd %xmm0, %edx
 ; CHECK-NEXT:    xorl %esi, %esi
@@ -133,10 +133,10 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) {
 ; CHECK-NEXT:    pextrw $3, %xmm0, %ecx
 ; CHECK-NEXT:    shll $2, %ecx
 ; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    shrl $16, %edx
-; CHECK-NEXT:    movl %edx, %esi
-; CHECK-NEXT:    shldw $1, %cx, %si
-; CHECK-NEXT:    cmpl $32768, %edx # imm = 0x8000
+; CHECK-NEXT:    shrl $16, %ecx
+; CHECK-NEXT:    movl %ecx, %esi
+; CHECK-NEXT:    shldw $1, %dx, %si
+; CHECK-NEXT:    cmpl $32768, %ecx # imm = 0x8000
 ; CHECK-NEXT:    cmovael %eax, %esi
 ; CHECK-NEXT:    pinsrw $3, %esi, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 320dce840ea57..6cb43234d713b 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -397,8 +397,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    shrl $8, %ecx
-; AVX512F-NEXT:    xorb %al, %cl
+; AVX512F-NEXT:    shrl $8, %eax
+; AVX512F-NEXT:    xorb %cl, %al
 ; AVX512F-NEXT:    setnp %al
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -409,8 +409,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    shrl $8, %ecx
-; AVX512BW-NEXT:    xorb %al, %cl
+; AVX512BW-NEXT:    shrl $8, %eax
+; AVX512BW-NEXT:    xorb %cl, %al
 ; AVX512BW-NEXT:    setnp %al
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -421,8 +421,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512VL-NEXT:    vpmovw2m %ymm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
 ; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    shrl $8, %ecx
-; AVX512VL-NEXT:    xorb %al, %cl
+; AVX512VL-NEXT:    shrl $8, %eax
+; AVX512VL-NEXT:    xorb %cl, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -722,8 +722,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    shrl $8, %ecx
-; AVX512F-NEXT:    xorb %al, %cl
+; AVX512F-NEXT:    shrl $8, %eax
+; AVX512F-NEXT:    xorb %cl, %al
 ; AVX512F-NEXT:    setnp %al
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -734,8 +734,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    shrl $8, %ecx
-; AVX512BW-NEXT:    xorb %al, %cl
+; AVX512BW-NEXT:    shrl $8, %eax
+; AVX512BW-NEXT:    xorb %cl, %al
 ; AVX512BW-NEXT:    setnp %al
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -746,8 +746,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
 ; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    shrl $8, %ecx
-; AVX512VL-NEXT:    xorb %al, %cl
+; AVX512VL-NEXT:    shrl $8, %eax
+; AVX512VL-NEXT:    xorb %cl, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -974,13 +974,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512BW-NEXT:    vpsllw $7, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shrq $32, %rcx
-; AVX512BW-NEXT:    xorl %eax, %ecx
-; AVX512BW-NEXT:    movl %ecx, %eax
-; AVX512BW-NEXT:    shrl $16, %eax
+; AVX512BW-NEXT:    movl %eax, %ecx
+; AVX512BW-NEXT:    shrq $32, %rax
 ; AVX512BW-NEXT:    xorl %ecx, %eax
-; AVX512BW-NEXT:    xorb %ah, %al
+; AVX512BW-NEXT:    movl %eax, %ecx
+; AVX512BW-NEXT:    shrl $16, %ecx
+; AVX512BW-NEXT:    xorl %eax, %ecx
+; AVX512BW-NEXT:    xorb %ch, %cl
 ; AVX512BW-NEXT:    setnp %al
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -990,13 +990,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512VL-NEXT:    vpsllw $7, %zmm0, %zmm0
 ; AVX512VL-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512VL-NEXT:    kmovq %k0, %rax
-; AVX512VL-NEXT:    movq %rax, %rcx
-; AVX512VL-NEXT:    shrq $32, %rcx
-; AVX512VL-NEXT:    xorl %eax, %ecx
-; AVX512VL-NEXT:    movl %ecx, %eax
-; AVX512VL-NEXT:    shrl $16, %eax
+; AVX512VL-NEXT:    movl %eax, %ecx
+; AVX512VL-NEXT:    shrq $32, %rax
 ; AVX512VL-NEXT:    xorl %ecx, %eax
-; AVX512VL-NEXT:    xorb %ah, %al
+; AVX512VL-NEXT:    movl %eax, %ecx
+; AVX512VL-NEXT:    shrl $16, %ecx
+; AVX512VL-NEXT:    xorl %eax, %ecx
+; AVX512VL-NEXT:    xorb %ch, %cl
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -1211,8 +1211,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind {
 ; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    shrl $8, %ecx
-; AVX512BW-NEXT:    xorb %al, %cl
+; AVX512BW-NEXT:    shrl $8, %eax
+; AVX512BW-NEXT:    xorb %cl, %al
 ; AVX512BW-NEXT:    setnp %al
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1222,8 +1222,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind {
 ; AVX512VL-NEXT:    vptestnmb %xmm0, %xmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
 ; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    shrl $8, %ecx
-; AVX512VL-NEXT:    xorb %al, %cl
+; AVX512VL-NEXT:    shrl $8, %eax
+; AVX512VL-NEXT:    xorb %cl, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <16 x i8> %0, zeroinitializer
@@ -1427,8 +1427,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    shrl $8, %ecx
-; AVX512F-NEXT:    xorb %al, %cl
+; AVX512F-NEXT:    shrl $8, %eax
+; AVX512F-NEXT:    xorb %cl, %al
 ; AVX512F-NEXT:    setnp %al
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1439,8 +1439,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512BW-NEXT:    vptestnmw %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    shrl $8, %ecx
-; AVX512BW-NEXT:    xorb %al, %cl
+; AVX512BW-NEXT:    shrl $8, %eax
+; AVX512BW-NEXT:    xorb %cl, %al
 ; AVX512BW-NEXT:    setnp %al
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1450,8 +1450,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
 ; AVX512VL-NEXT:    vptestnmw %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
 ; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    shrl $8, %ecx
-; AVX512VL-NEXT:    xorb %al, %cl
+; AVX512VL-NEXT:    shrl $8, %eax
+; AVX512VL-NEXT:    xorb %cl, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -1756,8 +1756,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    shrl $8, %ecx
-; AVX512F-NEXT:    xorb %al, %cl
+; AVX512F-NEXT:    shrl $8, %eax
+; AVX512F-NEXT:    xorb %cl, %al
 ; AVX512F-NEXT:    setnp %al
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1767,8 +1767,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    shrl $8, %ecx
-; AVX512BW-NEXT:    xorb %al, %cl
+; AVX512BW-NEXT:    shrl $8, %eax
+; AVX512BW-NEXT:    xorb %cl, %al
 ; AVX512BW-NEXT:    setnp %al
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1778,8 +1778,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
 ; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
 ; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    shrl $8, %ecx
-; AVX512VL-NEXT:    xorb %al, %cl
+; AVX512VL-NEXT:    shrl $8, %eax
+; AVX512VL-NEXT:    xorb %cl, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -2010,13 +2010,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shrq $32, %rcx
-; AVX512BW-NEXT:    xorl %eax, %ecx
-; AVX512BW-NEXT:    movl %ecx, %eax
-; AVX512BW-NEXT:    shrl $16, %eax
+; AVX512BW-NEXT:    movl %eax, %ecx
+; AVX512BW-NEXT:    shrq $32, %rax
 ; AVX512BW-NEXT:    xorl %ecx, %eax
-; AVX512BW-NEXT:    xorb %ah, %al
+; AVX512BW-NEXT:    movl %eax, %ecx
+; AVX512BW-NEXT:    shrl $16, %ecx
+; AVX512BW-NEXT:    xorl %eax, %ecx
+; AVX512BW-NEXT:    xorb %ch, %cl
 ; AVX512BW-NEXT:    setnp %al
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -2025,13 +2025,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vptestnmb %zmm0, %zmm0, %k0
 ; AVX512VL-NEXT:    kmovq %k0, %rax
-; AVX512VL-NEXT:    movq %rax, %rcx
-; AVX512VL-NEXT:    shrq $32, %rcx
-; AVX512VL-NEXT:    xorl %eax, %ecx
-; AVX512VL-NEXT:    movl %ecx, %eax
-; AVX512VL-NEXT:    shrl $16, %eax
+; AVX512VL-NEXT:    movl %eax, %ecx
+; AVX512VL-NEXT:    shrq $32, %rax
 ; AVX512VL-NEXT:    xorl %ecx, %eax
-; AVX512VL-NEXT:    xorb %ah, %al
+; AVX512VL-NEXT:    movl %eax, %ecx
+; AVX512VL-NEXT:    shrl $16, %ecx
+; AVX512VL-NEXT:    xorl %eax, %ecx
+; AVX512VL-NEXT:    xorb %ch, %cl
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -2240,8 +2240,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind {
 ; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    shrl $8, %ecx
-; AVX512BW-NEXT:    xorb %al, %cl
+; AVX512BW-NEXT:    shrl $8, %eax
+; AVX512BW-NEXT:    xorb %cl, %al
 ; AVX512BW-NEXT:    setnp %al
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -2251,8 +2251,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind {
 ; AVX512VL-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
 ; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    shrl $8, %ecx
-; AVX512VL-NEXT:    xorb %al, %cl
+; AVX512VL-NEXT:    shrl $8, %eax
+; AVX512VL-NEXT:    xorb %cl, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    retq
   %a = icmp eq <16 x i8> %0, %1
@@ -2504,8 +2504,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    shrl $8, %ecx
-; AVX512F-NEXT:    xorb %al, %cl
+; AVX512F-NEXT:    shrl $8, %eax
+; AVX512F-NEXT:    xorb %cl, %al
 ; AVX512F-NEXT:    setnp %al
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -2517,8 +2517,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
 ; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    shrl $8, %ecx
-; AVX512BW-NEXT:    xorb %al, %cl
+; AVX512BW-NEXT:    shrl $8, %eax
+; AVX512BW-NEXT:    xorb %cl, %al
 ; AVX512BW-NEXT:    setnp %al
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -2528,8 +2528,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
 ; AVX512VL-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
 ; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    shrl $8, %ecx
-; AVX512VL-NEXT:    xorb %al, %cl
+; AVX512VL-NEXT:    shrl $8, %eax
+; AVX512VL-NEXT:    xorb %cl, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -2845,8 +2845,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
 ; AVX512F-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    shrl $8, %ecx
-; AVX512F-NEXT:    xorb %al, %cl
+; AVX512F-NEXT:    shrl $8, %eax
+; AVX512F-NEXT:    xorb %cl, %al
 ; AVX512F-NEXT:    setnp %al
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -2856,8 +2856,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
 ; AVX512BW-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    shrl $8, %ecx
-; AVX512BW-NEXT:    xorb %al, %cl
+; AVX512BW-NEXT:    shrl $8, %eax
+; AVX512BW-NEXT:    xorb %cl, %al
 ; AVX512BW-NEXT:    setnp %al
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -2867,8 +2867,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
 ; AVX512VL-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; AVX512VL-NEXT:    kmovd %k0, %eax
 ; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    shrl $8, %ecx
-; AVX512VL-NEXT:    xorb %al, %cl
+; AVX512VL-NEXT:    shrl $8, %eax
+; AVX512VL-NEXT:    xorb %cl, %al
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -3097,13 +3097,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    shrq $32, %rcx
-; AVX512BW-NEXT:    xorl %eax, %ecx
-; AVX512BW-NEXT:    movl %ecx, %eax
-; AVX512BW-NEXT:    shrl $16, %eax
+; AVX512BW-NEXT:    movl %eax, %ecx
+; AVX512BW-NEXT:    shrq $32, %rax
 ; AVX512BW-NEXT:    xorl %ecx, %eax
-; AVX512BW-NEXT:    xorb %ah, %al
+; AVX512BW-NEXT:    movl %eax, %ecx
+; AVX512BW-NEXT:    shrl $16, %ecx
+; AVX512BW-NEXT:    xorl %eax, %ecx
+; AVX512BW-NEXT:    xorb %ch, %cl
 ; AVX512BW-NEXT:    setnp %al
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -3112,13 +3112,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512VL-NEXT:    kmovq %k0, %rax
-; AVX512VL-NEXT:    movq %rax, %rcx
-; AVX512VL-NEXT:    shrq $32, %rcx
-; AVX512VL-NEXT:    xorl %eax, %ecx
-; AVX512VL-NEXT:    movl %ecx, %eax
-; AVX512VL-NEXT:    shrl $16, %eax
+; AVX512VL-NEXT:    movl %eax, %ecx
+; AVX512VL-NEXT:    shrq $32, %rax
 ; AVX512VL-NEXT:    xorl %ecx, %eax
-; AVX512VL-NEXT:    xorb %ah, %al
+; AVX512VL-NEXT:    movl %eax, %ecx
+; AVX512VL-NEXT:    shrl $16, %ecx
+; AVX512VL-NEXT:    xorl %eax, %ecx
+; AVX512VL-NEXT:    xorb %ch, %cl
 ; AVX512VL-NEXT:    setnp %al
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 3c98eba69ae5b..1c3d27fac4203 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -777,31 +777,31 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %edx, (%esp)
+; FALLBACK18-NEXT:    movl %eax, %ecx
 ; FALLBACK18-NEXT:    andb $12, %bl
-; FALLBACK18-NEXT:    movzbl %bl, %esi
-; FALLBACK18-NEXT:    movl 4(%esp,%esi), %edi
-; FALLBACK18-NEXT:    movl 8(%esp,%esi), %ebx
-; FALLBACK18-NEXT:    shrxl %eax, %edi, %ebp
-; FALLBACK18-NEXT:    movl %eax, %edx
-; FALLBACK18-NEXT:    notb %dl
-; FALLBACK18-NEXT:    leal (%ebx,%ebx), %ecx
-; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
-; FALLBACK18-NEXT:    orl %ebp, %ecx
-; FALLBACK18-NEXT:    shrxl %eax, (%esp,%esi), %ebp
-; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
-; FALLBACK18-NEXT:    orl %ebp, %edi
-; FALLBACK18-NEXT:    shrxl %eax, %ebx, %ebx
-; FALLBACK18-NEXT:    movl 12(%esp,%esi), %esi
-; FALLBACK18-NEXT:    shrxl %eax, %esi, %eax
-; FALLBACK18-NEXT:    addl %esi, %esi
-; FALLBACK18-NEXT:    shlxl %edx, %esi, %edx
-; FALLBACK18-NEXT:    orl %ebx, %edx
+; FALLBACK18-NEXT:    movzbl %bl, %edi
+; FALLBACK18-NEXT:    movl 4(%esp,%edi), %ebx
+; FALLBACK18-NEXT:    movl 8(%esp,%edi), %esi
+; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %ebp
+; FALLBACK18-NEXT:    notb %al
+; FALLBACK18-NEXT:    leal (%esi,%esi), %edx
+; FALLBACK18-NEXT:    shlxl %eax, %edx, %edx
+; FALLBACK18-NEXT:    orl %ebp, %edx
+; FALLBACK18-NEXT:    shrxl %ecx, (%esp,%edi), %ebp
+; FALLBACK18-NEXT:    addl %ebx, %ebx
+; FALLBACK18-NEXT:    shlxl %eax, %ebx, %ebx
+; FALLBACK18-NEXT:    orl %ebp, %ebx
+; FALLBACK18-NEXT:    movl 12(%esp,%edi), %edi
+; FALLBACK18-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK18-NEXT:    shlxl %eax, %ebp, %eax
+; FALLBACK18-NEXT:    shrxl %ecx, %esi, %esi
+; FALLBACK18-NEXT:    orl %esi, %eax
+; FALLBACK18-NEXT:    shrxl %ecx, %edi, %ecx
 ; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; FALLBACK18-NEXT:    movl %eax, 12(%esi)
-; FALLBACK18-NEXT:    movl %edx, 8(%esi)
-; FALLBACK18-NEXT:    movl %edi, (%esi)
-; FALLBACK18-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK18-NEXT:    movl %ecx, 12(%esi)
+; FALLBACK18-NEXT:    movl %eax, 8(%esi)
+; FALLBACK18-NEXT:    movl %ebx, (%esi)
+; FALLBACK18-NEXT:    movl %edx, 4(%esi)
 ; FALLBACK18-NEXT:    addl $44, %esp
 ; FALLBACK18-NEXT:    popl %esi
 ; FALLBACK18-NEXT:    popl %edi
@@ -962,42 +962,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    pushl %ebx
 ; FALLBACK22-NEXT:    pushl %edi
 ; FALLBACK22-NEXT:    pushl %esi
-; FALLBACK22-NEXT:    subl $44, %esp
+; FALLBACK22-NEXT:    subl $60, %esp
 ; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK22-NEXT:    movups (%ecx), %xmm0
-; FALLBACK22-NEXT:    movzbl (%eax), %ecx
-; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    movzbl (%eax), %edx
+; FALLBACK22-NEXT:    movl %edx, %eax
 ; FALLBACK22-NEXT:    shlb $3, %al
 ; FALLBACK22-NEXT:    xorps %xmm1, %xmm1
 ; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    movaps %xmm0, (%esp)
-; FALLBACK22-NEXT:    andb $12, %cl
-; FALLBACK22-NEXT:    movzbl %cl, %edi
-; FALLBACK22-NEXT:    shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movl %eax, %ecx
-; FALLBACK22-NEXT:    notb %cl
-; FALLBACK22-NEXT:    movl 4(%esp,%edi), %ebp
-; FALLBACK22-NEXT:    movl 8(%esp,%edi), %esi
-; FALLBACK22-NEXT:    leal (%ebp,%ebp), %edx
-; FALLBACK22-NEXT:    shlxl %ecx, %edx, %edx
-; FALLBACK22-NEXT:    orl %ebx, %edx
-; FALLBACK22-NEXT:    shrxl %eax, %esi, %ebx
-; FALLBACK22-NEXT:    shrxl %eax, %ebp, %ebp
-; FALLBACK22-NEXT:    movl 12(%esp,%edi), %edi
-; FALLBACK22-NEXT:    shrxl %eax, %edi, %eax
-; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ecx, %edi, %edi
-; FALLBACK22-NEXT:    orl %ebx, %edi
-; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %ecx, %esi, %ecx
-; FALLBACK22-NEXT:    orl %ebp, %ecx
+; FALLBACK22-NEXT:    andb $12, %dl
+; FALLBACK22-NEXT:    movzbl %dl, %edi
+; FALLBACK22-NEXT:    shrxl %ecx, 16(%esp,%edi), %ebp
+; FALLBACK22-NEXT:    notb %al
+; FALLBACK22-NEXT:    movl 20(%esp,%edi), %edx
+; FALLBACK22-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 24(%esp,%edi), %ebx
+; FALLBACK22-NEXT:    addl %edx, %edx
+; FALLBACK22-NEXT:    shlxl %eax, %edx, %edx
+; FALLBACK22-NEXT:    orl %ebp, %edx
+; FALLBACK22-NEXT:    movl 28(%esp,%edi), %ebp
+; FALLBACK22-NEXT:    leal (%ebp,%ebp), %edi
+; FALLBACK22-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK22-NEXT:    shrxl %ecx, %ebx, %esi
+; FALLBACK22-NEXT:    orl %esi, %edi
+; FALLBACK22-NEXT:    addl %ebx, %ebx
+; FALLBACK22-NEXT:    shlxl %eax, %ebx, %eax
+; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    shrxl %ecx, %ebp, %ecx
 ; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; FALLBACK22-NEXT:    movl %eax, 12(%esi)
-; FALLBACK22-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK22-NEXT:    movl %ecx, 12(%esi)
+; FALLBACK22-NEXT:    movl %eax, 4(%esi)
 ; FALLBACK22-NEXT:    movl %edi, 8(%esi)
 ; FALLBACK22-NEXT:    movl %edx, (%esi)
-; FALLBACK22-NEXT:    addl $44, %esp
+; FALLBACK22-NEXT:    addl $60, %esp
 ; FALLBACK22-NEXT:    popl %esi
 ; FALLBACK22-NEXT:    popl %edi
 ; FALLBACK22-NEXT:    popl %ebx
@@ -1152,42 +1153,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    pushl %ebx
 ; FALLBACK26-NEXT:    pushl %edi
 ; FALLBACK26-NEXT:    pushl %esi
-; FALLBACK26-NEXT:    subl $44, %esp
+; FALLBACK26-NEXT:    subl $60, %esp
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
-; FALLBACK26-NEXT:    movzbl (%eax), %ecx
-; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    movzbl (%eax), %edx
+; FALLBACK26-NEXT:    movl %edx, %eax
 ; FALLBACK26-NEXT:    shlb $3, %al
 ; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK26-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    vmovaps %xmm0, (%esp)
-; FALLBACK26-NEXT:    andb $12, %cl
-; FALLBACK26-NEXT:    movzbl %cl, %edi
-; FALLBACK26-NEXT:    shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK26-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    movl %eax, %ecx
-; FALLBACK26-NEXT:    notb %cl
-; FALLBACK26-NEXT:    movl 4(%esp,%edi), %ebp
-; FALLBACK26-NEXT:    movl 8(%esp,%edi), %esi
-; FALLBACK26-NEXT:    leal (%ebp,%ebp), %edx
-; FALLBACK26-NEXT:    shlxl %ecx, %edx, %edx
-; FALLBACK26-NEXT:    orl %ebx, %edx
-; FALLBACK26-NEXT:    shrxl %eax, %esi, %ebx
-; FALLBACK26-NEXT:    shrxl %eax, %ebp, %ebp
-; FALLBACK26-NEXT:    movl 12(%esp,%edi), %edi
-; FALLBACK26-NEXT:    shrxl %eax, %edi, %eax
-; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ecx, %edi, %edi
-; FALLBACK26-NEXT:    orl %ebx, %edi
-; FALLBACK26-NEXT:    addl %esi, %esi
-; FALLBACK26-NEXT:    shlxl %ecx, %esi, %ecx
-; FALLBACK26-NEXT:    orl %ebp, %ecx
+; FALLBACK26-NEXT:    andb $12, %dl
+; FALLBACK26-NEXT:    movzbl %dl, %edi
+; FALLBACK26-NEXT:    shrxl %ecx, 16(%esp,%edi), %ebp
+; FALLBACK26-NEXT:    notb %al
+; FALLBACK26-NEXT:    movl 20(%esp,%edi), %edx
+; FALLBACK26-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 24(%esp,%edi), %ebx
+; FALLBACK26-NEXT:    addl %edx, %edx
+; FALLBACK26-NEXT:    shlxl %eax, %edx, %edx
+; FALLBACK26-NEXT:    orl %ebp, %edx
+; FALLBACK26-NEXT:    movl 28(%esp,%edi), %ebp
+; FALLBACK26-NEXT:    leal (%ebp,%ebp), %edi
+; FALLBACK26-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK26-NEXT:    shrxl %ecx, %ebx, %esi
+; FALLBACK26-NEXT:    orl %esi, %edi
+; FALLBACK26-NEXT:    addl %ebx, %ebx
+; FALLBACK26-NEXT:    shlxl %eax, %ebx, %eax
+; FALLBACK26-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    shrxl %ecx, %ebp, %ecx
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; FALLBACK26-NEXT:    movl %eax, 12(%esi)
-; FALLBACK26-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK26-NEXT:    movl %ecx, 12(%esi)
+; FALLBACK26-NEXT:    movl %eax, 4(%esi)
 ; FALLBACK26-NEXT:    movl %edi, 8(%esi)
 ; FALLBACK26-NEXT:    movl %edx, (%esi)
-; FALLBACK26-NEXT:    addl $44, %esp
+; FALLBACK26-NEXT:    addl $60, %esp
 ; FALLBACK26-NEXT:    popl %esi
 ; FALLBACK26-NEXT:    popl %edi
 ; FALLBACK26-NEXT:    popl %ebx
@@ -1342,42 +1344,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    pushl %ebx
 ; FALLBACK30-NEXT:    pushl %edi
 ; FALLBACK30-NEXT:    pushl %esi
-; FALLBACK30-NEXT:    subl $44, %esp
+; FALLBACK30-NEXT:    subl $60, %esp
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
-; FALLBACK30-NEXT:    movzbl (%eax), %ecx
-; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    movzbl (%eax), %edx
+; FALLBACK30-NEXT:    movl %edx, %eax
 ; FALLBACK30-NEXT:    shlb $3, %al
 ; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK30-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    vmovaps %xmm0, (%esp)
-; FALLBACK30-NEXT:    andb $12, %cl
-; FALLBACK30-NEXT:    movzbl %cl, %edi
-; FALLBACK30-NEXT:    shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK30-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    movl %eax, %ecx
-; FALLBACK30-NEXT:    notb %cl
-; FALLBACK30-NEXT:    movl 4(%esp,%edi), %ebp
-; FALLBACK30-NEXT:    movl 8(%esp,%edi), %esi
-; FALLBACK30-NEXT:    leal (%ebp,%ebp), %edx
-; FALLBACK30-NEXT:    shlxl %ecx, %edx, %edx
-; FALLBACK30-NEXT:    orl %ebx, %edx
-; FALLBACK30-NEXT:    shrxl %eax, %esi, %ebx
-; FALLBACK30-NEXT:    shrxl %eax, %ebp, %ebp
-; FALLBACK30-NEXT:    movl 12(%esp,%edi), %edi
-; FALLBACK30-NEXT:    shrxl %eax, %edi, %eax
-; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ecx, %edi, %edi
-; FALLBACK30-NEXT:    orl %ebx, %edi
-; FALLBACK30-NEXT:    addl %esi, %esi
-; FALLBACK30-NEXT:    shlxl %ecx, %esi, %ecx
-; FALLBACK30-NEXT:    orl %ebp, %ecx
+; FALLBACK30-NEXT:    andb $12, %dl
+; FALLBACK30-NEXT:    movzbl %dl, %edi
+; FALLBACK30-NEXT:    shrxl %ecx, 16(%esp,%edi), %ebp
+; FALLBACK30-NEXT:    notb %al
+; FALLBACK30-NEXT:    movl 20(%esp,%edi), %edx
+; FALLBACK30-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 24(%esp,%edi), %ebx
+; FALLBACK30-NEXT:    addl %edx, %edx
+; FALLBACK30-NEXT:    shlxl %eax, %edx, %edx
+; FALLBACK30-NEXT:    orl %ebp, %edx
+; FALLBACK30-NEXT:    movl 28(%esp,%edi), %ebp
+; FALLBACK30-NEXT:    leal (%ebp,%ebp), %edi
+; FALLBACK30-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK30-NEXT:    shrxl %ecx, %ebx, %esi
+; FALLBACK30-NEXT:    orl %esi, %edi
+; FALLBACK30-NEXT:    addl %ebx, %ebx
+; FALLBACK30-NEXT:    shlxl %eax, %ebx, %eax
+; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    shrxl %ecx, %ebp, %ecx
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; FALLBACK30-NEXT:    movl %eax, 12(%esi)
-; FALLBACK30-NEXT:    movl %ecx, 4(%esi)
+; FALLBACK30-NEXT:    movl %ecx, 12(%esi)
+; FALLBACK30-NEXT:    movl %eax, 4(%esi)
 ; FALLBACK30-NEXT:    movl %edi, 8(%esi)
 ; FALLBACK30-NEXT:    movl %edx, (%esi)
-; FALLBACK30-NEXT:    addl $44, %esp
+; FALLBACK30-NEXT:    addl $60, %esp
 ; FALLBACK30-NEXT:    popl %esi
 ; FALLBACK30-NEXT:    popl %edi
 ; FALLBACK30-NEXT:    popl %ebx
@@ -1784,41 +1787,41 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl 4(%ecx), %esi
 ; FALLBACK18-NEXT:    movl 8(%ecx), %edi
 ; FALLBACK18-NEXT:    movl 12(%ecx), %ecx
-; FALLBACK18-NEXT:    movzbl (%eax), %eax
-; FALLBACK18-NEXT:    movl %eax, %ebx
-; FALLBACK18-NEXT:    shlb $3, %bl
+; FALLBACK18-NEXT:    movzbl (%eax), %ebx
+; FALLBACK18-NEXT:    movl %ebx, %eax
+; FALLBACK18-NEXT:    shlb $3, %al
 ; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK18-NEXT:    movaps %xmm0, (%esp)
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    andb $12, %al
-; FALLBACK18-NEXT:    negb %al
-; FALLBACK18-NEXT:    movsbl %al, %edx
-; FALLBACK18-NEXT:    movl 16(%esp,%edx), %edi
-; FALLBACK18-NEXT:    movl 20(%esp,%edx), %ecx
-; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %ebp
-; FALLBACK18-NEXT:    movl %ebx, %eax
+; FALLBACK18-NEXT:    movl %eax, %ecx
+; FALLBACK18-NEXT:    andb $12, %bl
+; FALLBACK18-NEXT:    negb %bl
+; FALLBACK18-NEXT:    movsbl %bl, %esi
+; FALLBACK18-NEXT:    movl 16(%esp,%esi), %ebx
+; FALLBACK18-NEXT:    movl 20(%esp,%esi), %edx
+; FALLBACK18-NEXT:    shlxl %ecx, %edx, %edi
 ; FALLBACK18-NEXT:    notb %al
-; FALLBACK18-NEXT:    shrl %edi
-; FALLBACK18-NEXT:    shrxl %eax, %edi, %edi
-; FALLBACK18-NEXT:    orl %esi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, 28(%esp,%edx), %esi
-; FALLBACK18-NEXT:    movl 24(%esp,%edx), %edx
-; FALLBACK18-NEXT:    shlxl %ebx, %edx, %ebx
+; FALLBACK18-NEXT:    shlxl %ecx, %ebx, %ebp
+; FALLBACK18-NEXT:    shrl %ebx
+; FALLBACK18-NEXT:    shrxl %eax, %ebx, %ebx
+; FALLBACK18-NEXT:    orl %edi, %ebx
+; FALLBACK18-NEXT:    shlxl %ecx, 28(%esp,%esi), %edi
+; FALLBACK18-NEXT:    movl 24(%esp,%esi), %esi
+; FALLBACK18-NEXT:    shlxl %ecx, %esi, %ecx
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %eax, %esi, %esi
+; FALLBACK18-NEXT:    orl %edi, %esi
 ; FALLBACK18-NEXT:    shrl %edx
-; FALLBACK18-NEXT:    shrxl %eax, %edx, %edx
-; FALLBACK18-NEXT:    orl %esi, %edx
-; FALLBACK18-NEXT:    shrl %ecx
-; FALLBACK18-NEXT:    shrxl %eax, %ecx, %eax
-; FALLBACK18-NEXT:    orl %ebx, %eax
+; FALLBACK18-NEXT:    shrxl %eax, %edx, %eax
+; FALLBACK18-NEXT:    orl %ecx, %eax
 ; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK18-NEXT:    movl %ebp, (%ecx)
 ; FALLBACK18-NEXT:    movl %eax, 8(%ecx)
-; FALLBACK18-NEXT:    movl %edx, 12(%ecx)
-; FALLBACK18-NEXT:    movl %edi, 4(%ecx)
+; FALLBACK18-NEXT:    movl %esi, 12(%ecx)
+; FALLBACK18-NEXT:    movl %ebx, 4(%ecx)
 ; FALLBACK18-NEXT:    addl $44, %esp
 ; FALLBACK18-NEXT:    popl %esi
 ; FALLBACK18-NEXT:    popl %edi
@@ -1983,39 +1986,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK22-NEXT:    movups (%ecx), %xmm0
-; FALLBACK22-NEXT:    movzbl (%eax), %ecx
-; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    movzbl (%eax), %edx
+; FALLBACK22-NEXT:    movl %edx, %eax
 ; FALLBACK22-NEXT:    shlb $3, %al
 ; FALLBACK22-NEXT:    xorps %xmm1, %xmm1
 ; FALLBACK22-NEXT:    movaps %xmm1, (%esp)
 ; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    andb $12, %cl
-; FALLBACK22-NEXT:    negb %cl
-; FALLBACK22-NEXT:    movsbl %cl, %ecx
-; FALLBACK22-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
-; FALLBACK22-NEXT:    movl 24(%esp,%ecx), %edx
-; FALLBACK22-NEXT:    shlxl %eax, %edx, %edi
-; FALLBACK22-NEXT:    movl %eax, %ebx
-; FALLBACK22-NEXT:    notb %bl
-; FALLBACK22-NEXT:    shrl %edx
-; FALLBACK22-NEXT:    shrxl %ebx, %edx, %edx
-; FALLBACK22-NEXT:    orl %esi, %edx
-; FALLBACK22-NEXT:    movl 20(%esp,%ecx), %esi
-; FALLBACK22-NEXT:    movl %esi, %ebp
+; FALLBACK22-NEXT:    movl %eax, %ecx
+; FALLBACK22-NEXT:    andb $12, %dl
+; FALLBACK22-NEXT:    negb %dl
+; FALLBACK22-NEXT:    movsbl %dl, %edx
+; FALLBACK22-NEXT:    shlxl %ecx, 28(%esp,%edx), %edi
+; FALLBACK22-NEXT:    notb %al
+; FALLBACK22-NEXT:    movl 24(%esp,%edx), %esi
+; FALLBACK22-NEXT:    shlxl %ecx, %esi, %ebx
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %eax, %esi, %esi
+; FALLBACK22-NEXT:    orl %edi, %esi
+; FALLBACK22-NEXT:    movl 20(%esp,%edx), %edi
+; FALLBACK22-NEXT:    movl %edi, %ebp
 ; FALLBACK22-NEXT:    shrl %ebp
-; FALLBACK22-NEXT:    shrxl %ebx, %ebp, %ebp
-; FALLBACK22-NEXT:    orl %edi, %ebp
-; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
-; FALLBACK22-NEXT:    movl 16(%esp,%ecx), %ecx
-; FALLBACK22-NEXT:    shlxl %eax, %ecx, %eax
-; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT:    orl %esi, %ecx
-; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; FALLBACK22-NEXT:    movl %eax, (%esi)
-; FALLBACK22-NEXT:    movl %ecx, 4(%esi)
-; FALLBACK22-NEXT:    movl %ebp, 8(%esi)
-; FALLBACK22-NEXT:    movl %edx, 12(%esi)
+; FALLBACK22-NEXT:    shrxl %eax, %ebp, %ebp
+; FALLBACK22-NEXT:    orl %ebx, %ebp
+; FALLBACK22-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT:    movl 16(%esp,%edx), %edx
+; FALLBACK22-NEXT:    shlxl %ecx, %edx, %ecx
+; FALLBACK22-NEXT:    shrl %edx
+; FALLBACK22-NEXT:    shrxl %eax, %edx, %eax
+; FALLBACK22-NEXT:    orl %edi, %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT:    movl %ecx, (%edx)
+; FALLBACK22-NEXT:    movl %eax, 4(%edx)
+; FALLBACK22-NEXT:    movl %ebp, 8(%edx)
+; FALLBACK22-NEXT:    movl %esi, 12(%edx)
 ; FALLBACK22-NEXT:    addl $44, %esp
 ; FALLBACK22-NEXT:    popl %esi
 ; FALLBACK22-NEXT:    popl %edi
@@ -2175,39 +2178,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
-; FALLBACK26-NEXT:    movzbl (%eax), %ecx
-; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    movzbl (%eax), %edx
+; FALLBACK26-NEXT:    movl %edx, %eax
 ; FALLBACK26-NEXT:    shlb $3, %al
 ; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK26-NEXT:    vmovaps %xmm1, (%esp)
 ; FALLBACK26-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    andb $12, %cl
-; FALLBACK26-NEXT:    negb %cl
-; FALLBACK26-NEXT:    movsbl %cl, %ecx
-; FALLBACK26-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
-; FALLBACK26-NEXT:    movl 24(%esp,%ecx), %edx
-; FALLBACK26-NEXT:    shlxl %eax, %edx, %edi
-; FALLBACK26-NEXT:    movl %eax, %ebx
-; FALLBACK26-NEXT:    notb %bl
-; FALLBACK26-NEXT:    shrl %edx
-; FALLBACK26-NEXT:    shrxl %ebx, %edx, %edx
-; FALLBACK26-NEXT:    orl %esi, %edx
-; FALLBACK26-NEXT:    movl 20(%esp,%ecx), %esi
-; FALLBACK26-NEXT:    movl %esi, %ebp
+; FALLBACK26-NEXT:    movl %eax, %ecx
+; FALLBACK26-NEXT:    andb $12, %dl
+; FALLBACK26-NEXT:    negb %dl
+; FALLBACK26-NEXT:    movsbl %dl, %edx
+; FALLBACK26-NEXT:    shlxl %ecx, 28(%esp,%edx), %edi
+; FALLBACK26-NEXT:    notb %al
+; FALLBACK26-NEXT:    movl 24(%esp,%edx), %esi
+; FALLBACK26-NEXT:    shlxl %ecx, %esi, %ebx
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %eax, %esi, %esi
+; FALLBACK26-NEXT:    orl %edi, %esi
+; FALLBACK26-NEXT:    movl 20(%esp,%edx), %edi
+; FALLBACK26-NEXT:    movl %edi, %ebp
 ; FALLBACK26-NEXT:    shrl %ebp
-; FALLBACK26-NEXT:    shrxl %ebx, %ebp, %ebp
-; FALLBACK26-NEXT:    orl %edi, %ebp
-; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
-; FALLBACK26-NEXT:    movl 16(%esp,%ecx), %ecx
-; FALLBACK26-NEXT:    shlxl %eax, %ecx, %eax
-; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT:    orl %esi, %ecx
-; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; FALLBACK26-NEXT:    movl %eax, (%esi)
-; FALLBACK26-NEXT:    movl %ecx, 4(%esi)
-; FALLBACK26-NEXT:    movl %ebp, 8(%esi)
-; FALLBACK26-NEXT:    movl %edx, 12(%esi)
+; FALLBACK26-NEXT:    shrxl %eax, %ebp, %ebp
+; FALLBACK26-NEXT:    orl %ebx, %ebp
+; FALLBACK26-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT:    movl 16(%esp,%edx), %edx
+; FALLBACK26-NEXT:    shlxl %ecx, %edx, %ecx
+; FALLBACK26-NEXT:    shrl %edx
+; FALLBACK26-NEXT:    shrxl %eax, %edx, %eax
+; FALLBACK26-NEXT:    orl %edi, %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT:    movl %ecx, (%edx)
+; FALLBACK26-NEXT:    movl %eax, 4(%edx)
+; FALLBACK26-NEXT:    movl %ebp, 8(%edx)
+; FALLBACK26-NEXT:    movl %esi, 12(%edx)
 ; FALLBACK26-NEXT:    addl $44, %esp
 ; FALLBACK26-NEXT:    popl %esi
 ; FALLBACK26-NEXT:    popl %edi
@@ -2367,39 +2370,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
-; FALLBACK30-NEXT:    movzbl (%eax), %ecx
-; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    movzbl (%eax), %edx
+; FALLBACK30-NEXT:    movl %edx, %eax
 ; FALLBACK30-NEXT:    shlb $3, %al
 ; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK30-NEXT:    vmovaps %xmm1, (%esp)
 ; FALLBACK30-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    andb $12, %cl
-; FALLBACK30-NEXT:    negb %cl
-; FALLBACK30-NEXT:    movsbl %cl, %ecx
-; FALLBACK30-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
-; FALLBACK30-NEXT:    movl 24(%esp,%ecx), %edx
-; FALLBACK30-NEXT:    shlxl %eax, %edx, %edi
-; FALLBACK30-NEXT:    movl %eax, %ebx
-; FALLBACK30-NEXT:    notb %bl
-; FALLBACK30-NEXT:    shrl %edx
-; FALLBACK30-NEXT:    shrxl %ebx, %edx, %edx
-; FALLBACK30-NEXT:    orl %esi, %edx
-; FALLBACK30-NEXT:    movl 20(%esp,%ecx), %esi
-; FALLBACK30-NEXT:    movl %esi, %ebp
+; FALLBACK30-NEXT:    movl %eax, %ecx
+; FALLBACK30-NEXT:    andb $12, %dl
+; FALLBACK30-NEXT:    negb %dl
+; FALLBACK30-NEXT:    movsbl %dl, %edx
+; FALLBACK30-NEXT:    shlxl %ecx, 28(%esp,%edx), %edi
+; FALLBACK30-NEXT:    notb %al
+; FALLBACK30-NEXT:    movl 24(%esp,%edx), %esi
+; FALLBACK30-NEXT:    shlxl %ecx, %esi, %ebx
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %eax, %esi, %esi
+; FALLBACK30-NEXT:    orl %edi, %esi
+; FALLBACK30-NEXT:    movl 20(%esp,%edx), %edi
+; FALLBACK30-NEXT:    movl %edi, %ebp
 ; FALLBACK30-NEXT:    shrl %ebp
-; FALLBACK30-NEXT:    shrxl %ebx, %ebp, %ebp
-; FALLBACK30-NEXT:    orl %edi, %ebp
-; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
-; FALLBACK30-NEXT:    movl 16(%esp,%ecx), %ecx
-; FALLBACK30-NEXT:    shlxl %eax, %ecx, %eax
-; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT:    orl %esi, %ecx
-; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; FALLBACK30-NEXT:    movl %eax, (%esi)
-; FALLBACK30-NEXT:    movl %ecx, 4(%esi)
-; FALLBACK30-NEXT:    movl %ebp, 8(%esi)
-; FALLBACK30-NEXT:    movl %edx, 12(%esi)
+; FALLBACK30-NEXT:    shrxl %eax, %ebp, %ebp
+; FALLBACK30-NEXT:    orl %ebx, %ebp
+; FALLBACK30-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT:    movl 16(%esp,%edx), %edx
+; FALLBACK30-NEXT:    shlxl %ecx, %edx, %ecx
+; FALLBACK30-NEXT:    shrl %edx
+; FALLBACK30-NEXT:    shrxl %eax, %edx, %eax
+; FALLBACK30-NEXT:    orl %edi, %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT:    movl %ecx, (%edx)
+; FALLBACK30-NEXT:    movl %eax, 4(%edx)
+; FALLBACK30-NEXT:    movl %ebp, 8(%edx)
+; FALLBACK30-NEXT:    movl %esi, 12(%edx)
 ; FALLBACK30-NEXT:    addl $44, %esp
 ; FALLBACK30-NEXT:    popl %esi
 ; FALLBACK30-NEXT:    popl %edi
@@ -2833,31 +2836,31 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    andb $12, %bl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl %bl, %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 8(%esp,%esi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %dl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%ebx,%ebx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, (%esp,%esi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, %ebx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%esi), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarxl %eax, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %esi, %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl %bl, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%edi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 8(%esp,%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, (%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%edi), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %eax, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %edi, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, 12(%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, 8(%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, 12(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, 8(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ebx, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, 4(%esi)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl $44, %esp
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
@@ -3208,30 +3211,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movl %eax, %ecx
 ; FALLBACK2-NEXT:    andb $24, %sil
-; FALLBACK2-NEXT:    movzbl %sil, %ecx
-; FALLBACK2-NEXT:    movq -64(%rsp,%rcx), %rsi
-; FALLBACK2-NEXT:    movq -56(%rsp,%rcx), %rdi
-; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
-; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx), %r9
-; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK2-NEXT:    movq -48(%rsp,%rcx), %rcx
-; FALLBACK2-NEXT:    shrxq %rax, %rcx, %r11
-; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    movzbl %sil, %esi
+; FALLBACK2-NEXT:    movq -64(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT:    movq -56(%rsp,%rsi), %r8
+; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %r9
 ; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK2-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK2-NEXT:    orq %r9, %r10
+; FALLBACK2-NEXT:    shrxq %rcx, -72(%rsp,%rsi), %r9
 ; FALLBACK2-NEXT:    addq %rdi, %rdi
 ; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT:    orq %r8, %rdi
-; FALLBACK2-NEXT:    addq %rsi, %rsi
-; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r9, %rsi
-; FALLBACK2-NEXT:    addq %rcx, %rcx
-; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
-; FALLBACK2-NEXT:    orq %r10, %rax
-; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK2-NEXT:    orq %r9, %rdi
+; FALLBACK2-NEXT:    shrxq %rcx, %r8, %r8
+; FALLBACK2-NEXT:    movq -48(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT:    leaq (%rsi,%rsi), %r9
+; FALLBACK2-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK2-NEXT:    orq %r8, %rax
+; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %rcx
+; FALLBACK2-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
-; FALLBACK2-NEXT:    movq %rsi, (%rdx)
-; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, (%rdx)
+; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
 ; FALLBACK2-NEXT:    retq
 ;
 ; FALLBACK3-LABEL: lshr_32bytes:
@@ -3355,30 +3358,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movl %eax, %esi
 ; FALLBACK6-NEXT:    andb $24, %cl
 ; FALLBACK6-NEXT:    movzbl %cl, %ecx
-; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK6-NEXT:    movq -64(%rsp,%rcx), %rdi
-; FALLBACK6-NEXT:    movq -56(%rsp,%rcx), %r8
-; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK6-NEXT:    movq -48(%rsp,%rcx), %rcx
-; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK6-NEXT:    shrxq %rax, %rcx, %r11
-; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    shrxq %rsi, -72(%rsp,%rcx), %rdi
 ; FALLBACK6-NEXT:    notb %al
-; FALLBACK6-NEXT:    addq %rdi, %rdi
-; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT:    orq %rsi, %rdi
-; FALLBACK6-NEXT:    addq %rcx, %rcx
-; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT:    orq %r9, %rcx
-; FALLBACK6-NEXT:    addq %r8, %r8
-; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK6-NEXT:    orq %r10, %rax
-; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq -64(%rsp,%rcx), %r8
+; FALLBACK6-NEXT:    movq -56(%rsp,%rcx), %r9
+; FALLBACK6-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK6-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK6-NEXT:    orq %rdi, %r10
+; FALLBACK6-NEXT:    shrxq %rsi, %r9, %rdi
+; FALLBACK6-NEXT:    movq -48(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT:    leaq (%rcx,%rcx), %r11
+; FALLBACK6-NEXT:    shlxq %rax, %r11, %r11
+; FALLBACK6-NEXT:    orq %rdi, %r11
+; FALLBACK6-NEXT:    shrxq %rsi, %r8, %rdi
+; FALLBACK6-NEXT:    addq %r9, %r9
+; FALLBACK6-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK6-NEXT:    orq %rdi, %rax
+; FALLBACK6-NEXT:    shrxq %rsi, %rcx, %rcx
+; FALLBACK6-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT:    movq %rdi, (%rdx)
+; FALLBACK6-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r10, (%rdx)
 ; FALLBACK6-NEXT:    retq
 ;
 ; FALLBACK7-LABEL: lshr_32bytes:
@@ -3487,35 +3490,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-LABEL: lshr_32bytes:
 ; FALLBACK10:       # %bb.0:
 ; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
-; FALLBACK10-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK10-NEXT:    movzbl (%rsi), %eax
+; FALLBACK10-NEXT:    leal (,%rax,8), %ecx
 ; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT:    andb $24, %cl
-; FALLBACK10-NEXT:    movzbl %cl, %ecx
-; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK10-NEXT:    movq -64(%rsp,%rcx), %rdi
-; FALLBACK10-NEXT:    movq -56(%rsp,%rcx), %r8
-; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK10-NEXT:    movq -48(%rsp,%rcx), %rcx
-; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK10-NEXT:    shrxq %rax, %rcx, %r11
-; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT:    notb %al
-; FALLBACK10-NEXT:    addq %rdi, %rdi
-; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT:    orq %rsi, %rdi
-; FALLBACK10-NEXT:    addq %rcx, %rcx
-; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT:    orq %r9, %rcx
-; FALLBACK10-NEXT:    addq %r8, %r8
-; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK10-NEXT:    orq %r10, %rax
-; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT:    movq %rdi, (%rdx)
+; FALLBACK10-NEXT:    movl %ecx, %esi
+; FALLBACK10-NEXT:    andb $24, %al
+; FALLBACK10-NEXT:    movzbl %al, %eax
+; FALLBACK10-NEXT:    shrxq %rsi, -72(%rsp,%rax), %rdi
+; FALLBACK10-NEXT:    notb %cl
+; FALLBACK10-NEXT:    movq -64(%rsp,%rax), %r8
+; FALLBACK10-NEXT:    movq -56(%rsp,%rax), %r9
+; FALLBACK10-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK10-NEXT:    shlxq %rcx, %r10, %r10
+; FALLBACK10-NEXT:    orq %rdi, %r10
+; FALLBACK10-NEXT:    shrxq %rsi, %r9, %rdi
+; FALLBACK10-NEXT:    movq -48(%rsp,%rax), %rax
+; FALLBACK10-NEXT:    leaq (%rax,%rax), %r11
+; FALLBACK10-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK10-NEXT:    orq %rdi, %r11
+; FALLBACK10-NEXT:    shrxq %rsi, %r8, %rdi
+; FALLBACK10-NEXT:    addq %r9, %r9
+; FALLBACK10-NEXT:    shlxq %rcx, %r9, %rcx
+; FALLBACK10-NEXT:    orq %rdi, %rcx
+; FALLBACK10-NEXT:    shrxq %rsi, %rax, %rax
+; FALLBACK10-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
+; FALLBACK10-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r10, (%rdx)
 ; FALLBACK10-NEXT:    vzeroupper
 ; FALLBACK10-NEXT:    retq
 ;
@@ -3623,35 +3626,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-LABEL: lshr_32bytes:
 ; FALLBACK14:       # %bb.0:
 ; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
-; FALLBACK14-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK14-NEXT:    movzbl (%rsi), %eax
+; FALLBACK14-NEXT:    leal (,%rax,8), %ecx
 ; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT:    andb $24, %cl
-; FALLBACK14-NEXT:    movzbl %cl, %ecx
-; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK14-NEXT:    movq -64(%rsp,%rcx), %rdi
-; FALLBACK14-NEXT:    movq -56(%rsp,%rcx), %r8
-; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK14-NEXT:    movq -48(%rsp,%rcx), %rcx
-; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK14-NEXT:    shrxq %rax, %rcx, %r11
-; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT:    notb %al
-; FALLBACK14-NEXT:    addq %rdi, %rdi
-; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT:    orq %rsi, %rdi
-; FALLBACK14-NEXT:    addq %rcx, %rcx
-; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT:    orq %r9, %rcx
-; FALLBACK14-NEXT:    addq %r8, %r8
-; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK14-NEXT:    orq %r10, %rax
-; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT:    movq %rdi, (%rdx)
+; FALLBACK14-NEXT:    movl %ecx, %esi
+; FALLBACK14-NEXT:    andb $24, %al
+; FALLBACK14-NEXT:    movzbl %al, %eax
+; FALLBACK14-NEXT:    shrxq %rsi, -72(%rsp,%rax), %rdi
+; FALLBACK14-NEXT:    notb %cl
+; FALLBACK14-NEXT:    movq -64(%rsp,%rax), %r8
+; FALLBACK14-NEXT:    movq -56(%rsp,%rax), %r9
+; FALLBACK14-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK14-NEXT:    shlxq %rcx, %r10, %r10
+; FALLBACK14-NEXT:    orq %rdi, %r10
+; FALLBACK14-NEXT:    shrxq %rsi, %r9, %rdi
+; FALLBACK14-NEXT:    movq -48(%rsp,%rax), %rax
+; FALLBACK14-NEXT:    leaq (%rax,%rax), %r11
+; FALLBACK14-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK14-NEXT:    orq %rdi, %r11
+; FALLBACK14-NEXT:    shrxq %rsi, %r8, %rdi
+; FALLBACK14-NEXT:    addq %r9, %r9
+; FALLBACK14-NEXT:    shlxq %rcx, %r9, %rcx
+; FALLBACK14-NEXT:    orq %rdi, %rcx
+; FALLBACK14-NEXT:    shrxq %rsi, %rax, %rax
+; FALLBACK14-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 8(%rdx)
+; FALLBACK14-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r10, (%rdx)
 ; FALLBACK14-NEXT:    vzeroupper
 ; FALLBACK14-NEXT:    retq
 ;
@@ -3914,81 +3917,75 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %ebx, %eax
-; FALLBACK18-NEXT:    shlb $3, %al
+; FALLBACK18-NEXT:    movl %ebx, %ecx
+; FALLBACK18-NEXT:    shlb $3, %cl
 ; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, %eax
 ; FALLBACK18-NEXT:    andb $28, %bl
-; FALLBACK18-NEXT:    movzbl %bl, %edi
-; FALLBACK18-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK18-NEXT:    movl 40(%esp,%edi), %ecx
-; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %eax, %esi, %edx
+; FALLBACK18-NEXT:    movzbl %bl, %esi
+; FALLBACK18-NEXT:    movl 36(%esp,%esi), %edx
+; FALLBACK18-NEXT:    movl 40(%esp,%esi), %ebp
+; FALLBACK18-NEXT:    shrxl %eax, %edx, %edi
+; FALLBACK18-NEXT:    notb %cl
+; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ebx
+; FALLBACK18-NEXT:    shlxl %ecx, %ebx, %ebx
+; FALLBACK18-NEXT:    orl %edi, %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %eax, 32(%esp,%esi), %edi
+; FALLBACK18-NEXT:    addl %edx, %edx
+; FALLBACK18-NEXT:    shlxl %ecx, %edx, %edx
+; FALLBACK18-NEXT:    orl %edi, %edx
 ; FALLBACK18-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl %eax, %edx
-; FALLBACK18-NEXT:    movl %eax, %ebx
-; FALLBACK18-NEXT:    notb %dl
-; FALLBACK18-NEXT:    leal (%ecx,%ecx), %ebp
-; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
-; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl %ebx, %ecx
-; FALLBACK18-NEXT:    shrxl %ebx, 32(%esp,%edi), %ebx
-; FALLBACK18-NEXT:    addl %esi, %esi
-; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
-; FALLBACK18-NEXT:    orl %ebx, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 48(%esp,%edi), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    leal (%eax,%eax), %ebx
-; FALLBACK18-NEXT:    shlxl %edx, %ebx, %esi
-; FALLBACK18-NEXT:    movl 44(%esp,%edi), %ebp
-; FALLBACK18-NEXT:    movl %ecx, %eax
-; FALLBACK18-NEXT:    shrxl %ecx, %ebp, %ebx
-; FALLBACK18-NEXT:    orl %ebx, %esi
-; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK18-NEXT:    movl %eax, %ebx
-; FALLBACK18-NEXT:    addl %ebp, %ebp
-; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
-; FALLBACK18-NEXT:    orl %ecx, %eax
+; FALLBACK18-NEXT:    movl 48(%esp,%esi), %edx
+; FALLBACK18-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %edx, %edx
+; FALLBACK18-NEXT:    shlxl %ecx, %edx, %ebx
+; FALLBACK18-NEXT:    movl 44(%esp,%esi), %edx
+; FALLBACK18-NEXT:    shrxl %eax, %edx, %edi
+; FALLBACK18-NEXT:    orl %edi, %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %eax, %ebp, %edi
+; FALLBACK18-NEXT:    movl %eax, %ebp
+; FALLBACK18-NEXT:    addl %edx, %edx
+; FALLBACK18-NEXT:    shlxl %ecx, %edx, %eax
+; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 56(%esp,%edi), %ebp
-; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
-; FALLBACK18-NEXT:    movl 52(%esp,%edi), %eax
-; FALLBACK18-NEXT:    shrxl %ebx, %eax, %esi
-; FALLBACK18-NEXT:    orl %esi, %ecx
-; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 56(%esp,%esi), %edi
+; FALLBACK18-NEXT:    leal (%edi,%edi), %edx
+; FALLBACK18-NEXT:    shlxl %ecx, %edx, %edx
+; FALLBACK18-NEXT:    movl 52(%esp,%esi), %eax
+; FALLBACK18-NEXT:    shrxl %ebp, %eax, %ebx
+; FALLBACK18-NEXT:    orl %ebx, %edx
+; FALLBACK18-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %eax, %eax
-; FALLBACK18-NEXT:    shlxl %edx, %eax, %esi
-; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK18-NEXT:    shrxl %ebx, %ebp, %eax
-; FALLBACK18-NEXT:    movl 60(%esp,%edi), %edi
-; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ebx
-; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
-; FALLBACK18-NEXT:    orl %eax, %edi
-; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT:    movl %ebx, 28(%eax)
-; FALLBACK18-NEXT:    movl %edi, 24(%eax)
-; FALLBACK18-NEXT:    movl %esi, 16(%eax)
-; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, (%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    shlxl %ecx, %eax, %eax
+; FALLBACK18-NEXT:    orl %ebx, %eax
+; FALLBACK18-NEXT:    movl 60(%esp,%esi), %esi
+; FALLBACK18-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK18-NEXT:    shlxl %ecx, %ebx, %ecx
+; FALLBACK18-NEXT:    shrxl %ebp, %edi, %edi
+; FALLBACK18-NEXT:    orl %edi, %ecx
+; FALLBACK18-NEXT:    shrxl %ebp, %esi, %esi
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; FALLBACK18-NEXT:    movl %esi, 28(%edi)
+; FALLBACK18-NEXT:    movl %ecx, 24(%edi)
+; FALLBACK18-NEXT:    movl %eax, 16(%edi)
+; FALLBACK18-NEXT:    movl %edx, 20(%edi)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 8(%edi)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 12(%edi)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, (%edi)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 4(%edi)
 ; FALLBACK18-NEXT:    addl $108, %esp
 ; FALLBACK18-NEXT:    popl %esi
 ; FALLBACK18-NEXT:    popl %edi
@@ -4261,72 +4258,70 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK22-NEXT:    movups (%ecx), %xmm0
 ; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
-; FALLBACK22-NEXT:    movzbl (%eax), %ecx
-; FALLBACK22-NEXT:    movl %ecx, %edx
-; FALLBACK22-NEXT:    shlb $3, %dl
+; FALLBACK22-NEXT:    movzbl (%eax), %edx
+; FALLBACK22-NEXT:    movl %edx, %ecx
+; FALLBACK22-NEXT:    shlb $3, %cl
 ; FALLBACK22-NEXT:    xorps %xmm2, %xmm2
 ; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    andb $28, %cl
-; FALLBACK22-NEXT:    movzbl %cl, %edi
-; FALLBACK22-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
-; FALLBACK22-NEXT:    movl %edx, %eax
-; FALLBACK22-NEXT:    notb %al
-; FALLBACK22-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
-; FALLBACK22-NEXT:    orl %ecx, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 48(%esp,%edi), %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    addl %ecx, %ecx
-; FALLBACK22-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK22-NEXT:    movl %eax, %ebp
-; FALLBACK22-NEXT:    movl 44(%esp,%edi), %ecx
-; FALLBACK22-NEXT:    shrxl %edx, %ecx, %ebx
-; FALLBACK22-NEXT:    orl %ebx, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    addl %ecx, %ecx
-; FALLBACK22-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK22-NEXT:    movl 40(%esp,%edi), %eax
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    andb $28, %dl
+; FALLBACK22-NEXT:    movzbl %dl, %ebx
+; FALLBACK22-NEXT:    shrxl %eax, 32(%esp,%ebx), %edx
+; FALLBACK22-NEXT:    movl %eax, %edi
+; FALLBACK22-NEXT:    notb %cl
+; FALLBACK22-NEXT:    movl 36(%esp,%ebx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %eax, %ebx
-; FALLBACK22-NEXT:    orl %ebx, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 56(%esp,%edi), %esi
-; FALLBACK22-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK22-NEXT:    shlxl %ebp, %ebx, %eax
-; FALLBACK22-NEXT:    movl %ebp, %ecx
-; FALLBACK22-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK22-NEXT:    shrxl %edx, %ebx, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK22-NEXT:    shlxl %ecx, %esi, %eax
+; FALLBACK22-NEXT:    orl %edx, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK22-NEXT:    addl %ebx, %ebx
+; FALLBACK22-NEXT:    movl 48(%esp,%ebx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %edx
+; FALLBACK22-NEXT:    shlxl %ecx, %edx, %eax
+; FALLBACK22-NEXT:    movl 44(%esp,%ebx), %edx
+; FALLBACK22-NEXT:    shrxl %edi, %edx, %esi
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %edx, %edx
+; FALLBACK22-NEXT:    shlxl %ecx, %edx, %eax
+; FALLBACK22-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK22-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edi, %edx, %esi
+; FALLBACK22-NEXT:    movl %edi, %edx
+; FALLBACK22-NEXT:    orl %esi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 56(%esp,%ebx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK22-NEXT:    shlxl %ecx, %ebp, %ebp
+; FALLBACK22-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK22-NEXT:    shrxl %edi, %eax, %edi
+; FALLBACK22-NEXT:    orl %edi, %ebp
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %eax, %eax
+; FALLBACK22-NEXT:    shlxl %ecx, %eax, %edi
+; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    movl 60(%esp,%ebx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %ebx
 ; FALLBACK22-NEXT:    shlxl %ecx, %ebx, %ebx
-; FALLBACK22-NEXT:    orl %ebp, %ebx
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK22-NEXT:    movl 60(%esp,%edi), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %eax
-; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    movl %ecx, %edx
-; FALLBACK22-NEXT:    shlxl %ecx, %edi, %edi
-; FALLBACK22-NEXT:    orl %ebp, %edi
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    addl %ecx, %ecx
-; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ecx
-; FALLBACK22-NEXT:    orl %esi, %ecx
-; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK22-NEXT:    movl %eax, 28(%edx)
-; FALLBACK22-NEXT:    movl %ecx, 4(%edx)
-; FALLBACK22-NEXT:    movl %edi, 24(%edx)
-; FALLBACK22-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK22-NEXT:    orl %eax, %ebx
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT:    movl %eax, 20(%edx)
+; FALLBACK22-NEXT:    addl %eax, %eax
+; FALLBACK22-NEXT:    shlxl %ecx, %eax, %eax
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    orl %ecx, %eax
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT:    movl %ecx, 28(%edx)
+; FALLBACK22-NEXT:    movl %eax, 4(%edx)
+; FALLBACK22-NEXT:    movl %ebx, 24(%edx)
+; FALLBACK22-NEXT:    movl %edi, 16(%edx)
+; FALLBACK22-NEXT:    movl %ebp, 20(%edx)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK22-NEXT:    movl %eax, 8(%edx)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4585,70 +4580,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK26-NEXT:    movzbl (%eax), %ecx
-; FALLBACK26-NEXT:    movl %ecx, %edx
-; FALLBACK26-NEXT:    shlb $3, %dl
+; FALLBACK26-NEXT:    movzbl (%eax), %edx
+; FALLBACK26-NEXT:    movl %edx, %ecx
+; FALLBACK26-NEXT:    shlb $3, %cl
 ; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    andb $28, %cl
-; FALLBACK26-NEXT:    movzbl %cl, %edi
-; FALLBACK26-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
-; FALLBACK26-NEXT:    movl %edx, %eax
-; FALLBACK26-NEXT:    notb %al
-; FALLBACK26-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    addl %esi, %esi
-; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
-; FALLBACK26-NEXT:    orl %ecx, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 48(%esp,%edi), %ecx
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    addl %ecx, %ecx
-; FALLBACK26-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK26-NEXT:    movl %eax, %ebp
-; FALLBACK26-NEXT:    movl 44(%esp,%edi), %ecx
-; FALLBACK26-NEXT:    shrxl %edx, %ecx, %ebx
-; FALLBACK26-NEXT:    orl %ebx, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    addl %ecx, %ecx
-; FALLBACK26-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK26-NEXT:    movl 40(%esp,%edi), %eax
+; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    andb $28, %dl
+; FALLBACK26-NEXT:    movzbl %dl, %ebx
+; FALLBACK26-NEXT:    shrxl %eax, 32(%esp,%ebx), %edx
+; FALLBACK26-NEXT:    movl %eax, %edi
+; FALLBACK26-NEXT:    notb %cl
+; FALLBACK26-NEXT:    movl 36(%esp,%ebx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %eax, %ebx
-; FALLBACK26-NEXT:    orl %ebx, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 56(%esp,%edi), %esi
-; FALLBACK26-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK26-NEXT:    shlxl %ebp, %ebx, %eax
-; FALLBACK26-NEXT:    movl %ebp, %ecx
-; FALLBACK26-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK26-NEXT:    shrxl %edx, %ebx, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK26-NEXT:    shlxl %ecx, %esi, %eax
+; FALLBACK26-NEXT:    orl %edx, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK26-NEXT:    addl %ebx, %ebx
+; FALLBACK26-NEXT:    movl 48(%esp,%ebx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %edx
+; FALLBACK26-NEXT:    shlxl %ecx, %edx, %eax
+; FALLBACK26-NEXT:    movl 44(%esp,%ebx), %edx
+; FALLBACK26-NEXT:    shrxl %edi, %edx, %esi
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %edx, %edx
+; FALLBACK26-NEXT:    shlxl %ecx, %edx, %eax
+; FALLBACK26-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK26-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edi, %edx, %esi
+; FALLBACK26-NEXT:    movl %edi, %edx
+; FALLBACK26-NEXT:    orl %esi, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 56(%esp,%ebx), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK26-NEXT:    shlxl %ecx, %ebp, %ebp
+; FALLBACK26-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK26-NEXT:    shrxl %edi, %eax, %edi
+; FALLBACK26-NEXT:    orl %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %eax, %eax
+; FALLBACK26-NEXT:    shlxl %ecx, %eax, %edi
+; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK26-NEXT:    movl 60(%esp,%ebx), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %ebx
 ; FALLBACK26-NEXT:    shlxl %ecx, %ebx, %ebx
-; FALLBACK26-NEXT:    orl %ebp, %ebx
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK26-NEXT:    movl 60(%esp,%edi), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %eax
-; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    movl %ecx, %edx
-; FALLBACK26-NEXT:    shlxl %ecx, %edi, %edi
-; FALLBACK26-NEXT:    orl %ebp, %edi
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    addl %ecx, %ecx
-; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ecx
-; FALLBACK26-NEXT:    orl %esi, %ecx
-; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK26-NEXT:    movl %eax, 28(%edx)
-; FALLBACK26-NEXT:    movl %ecx, 4(%edx)
-; FALLBACK26-NEXT:    movl %edi, 24(%edx)
-; FALLBACK26-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK26-NEXT:    orl %eax, %ebx
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT:    movl %eax, 20(%edx)
+; FALLBACK26-NEXT:    addl %eax, %eax
+; FALLBACK26-NEXT:    shlxl %ecx, %eax, %eax
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK26-NEXT:    orl %ecx, %eax
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %ecx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT:    movl %ecx, 28(%edx)
+; FALLBACK26-NEXT:    movl %eax, 4(%edx)
+; FALLBACK26-NEXT:    movl %ebx, 24(%edx)
+; FALLBACK26-NEXT:    movl %edi, 16(%edx)
+; FALLBACK26-NEXT:    movl %ebp, 20(%edx)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK26-NEXT:    movl %eax, 8(%edx)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4906,70 +4899,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK30-NEXT:    movzbl (%eax), %ecx
-; FALLBACK30-NEXT:    movl %ecx, %edx
-; FALLBACK30-NEXT:    shlb $3, %dl
+; FALLBACK30-NEXT:    movzbl (%eax), %edx
+; FALLBACK30-NEXT:    movl %edx, %ecx
+; FALLBACK30-NEXT:    shlb $3, %cl
 ; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK30-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    andb $28, %cl
-; FALLBACK30-NEXT:    movzbl %cl, %edi
-; FALLBACK30-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
-; FALLBACK30-NEXT:    movl %edx, %eax
-; FALLBACK30-NEXT:    notb %al
-; FALLBACK30-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    addl %esi, %esi
-; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
-; FALLBACK30-NEXT:    orl %ecx, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 48(%esp,%edi), %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    addl %ecx, %ecx
-; FALLBACK30-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK30-NEXT:    movl %eax, %ebp
-; FALLBACK30-NEXT:    movl 44(%esp,%edi), %ecx
-; FALLBACK30-NEXT:    shrxl %edx, %ecx, %ebx
-; FALLBACK30-NEXT:    orl %ebx, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    addl %ecx, %ecx
-; FALLBACK30-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK30-NEXT:    movl 40(%esp,%edi), %eax
+; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    andb $28, %dl
+; FALLBACK30-NEXT:    movzbl %dl, %ebx
+; FALLBACK30-NEXT:    shrxl %eax, 32(%esp,%ebx), %edx
+; FALLBACK30-NEXT:    movl %eax, %edi
+; FALLBACK30-NEXT:    notb %cl
+; FALLBACK30-NEXT:    movl 36(%esp,%ebx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, %eax, %ebx
-; FALLBACK30-NEXT:    orl %ebx, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 56(%esp,%edi), %esi
-; FALLBACK30-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK30-NEXT:    shlxl %ebp, %ebx, %eax
-; FALLBACK30-NEXT:    movl %ebp, %ecx
-; FALLBACK30-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK30-NEXT:    shrxl %edx, %ebx, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK30-NEXT:    shlxl %ecx, %esi, %eax
+; FALLBACK30-NEXT:    orl %edx, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK30-NEXT:    addl %ebx, %ebx
+; FALLBACK30-NEXT:    movl 48(%esp,%ebx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %edx
+; FALLBACK30-NEXT:    shlxl %ecx, %edx, %eax
+; FALLBACK30-NEXT:    movl 44(%esp,%ebx), %edx
+; FALLBACK30-NEXT:    shrxl %edi, %edx, %esi
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %edx, %edx
+; FALLBACK30-NEXT:    shlxl %ecx, %edx, %eax
+; FALLBACK30-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK30-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edi, %edx, %esi
+; FALLBACK30-NEXT:    movl %edi, %edx
+; FALLBACK30-NEXT:    orl %esi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 56(%esp,%ebx), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK30-NEXT:    shlxl %ecx, %ebp, %ebp
+; FALLBACK30-NEXT:    movl 52(%esp,%ebx), %eax
+; FALLBACK30-NEXT:    shrxl %edi, %eax, %edi
+; FALLBACK30-NEXT:    orl %edi, %ebp
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %eax, %eax
+; FALLBACK30-NEXT:    shlxl %ecx, %eax, %edi
+; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK30-NEXT:    movl 60(%esp,%ebx), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %ebx
 ; FALLBACK30-NEXT:    shlxl %ecx, %ebx, %ebx
-; FALLBACK30-NEXT:    orl %ebp, %ebx
-; FALLBACK30-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK30-NEXT:    movl 60(%esp,%edi), %edi
-; FALLBACK30-NEXT:    shrxl %edx, %edi, %eax
-; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    movl %ecx, %edx
-; FALLBACK30-NEXT:    shlxl %ecx, %edi, %edi
-; FALLBACK30-NEXT:    orl %ebp, %edi
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    addl %ecx, %ecx
-; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ecx
-; FALLBACK30-NEXT:    orl %esi, %ecx
-; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK30-NEXT:    movl %eax, 28(%edx)
-; FALLBACK30-NEXT:    movl %ecx, 4(%edx)
-; FALLBACK30-NEXT:    movl %edi, 24(%edx)
-; FALLBACK30-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK30-NEXT:    orl %eax, %ebx
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT:    movl %eax, 20(%edx)
+; FALLBACK30-NEXT:    addl %eax, %eax
+; FALLBACK30-NEXT:    shlxl %ecx, %eax, %eax
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK30-NEXT:    orl %ecx, %eax
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT:    movl %ecx, 28(%edx)
+; FALLBACK30-NEXT:    movl %eax, 4(%edx)
+; FALLBACK30-NEXT:    movl %ebx, 24(%edx)
+; FALLBACK30-NEXT:    movl %edi, 16(%edx)
+; FALLBACK30-NEXT:    movl %ebp, 20(%edx)
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK30-NEXT:    movl %eax, 8(%edx)
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5157,30 +5148,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movl %eax, %ecx
 ; FALLBACK2-NEXT:    andb $6, %sil
-; FALLBACK2-NEXT:    movzbl %sil, %ecx
-; FALLBACK2-NEXT:    movq -64(%rsp,%rcx,4), %rsi
-; FALLBACK2-NEXT:    movq -56(%rsp,%rcx,4), %rdi
-; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
-; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %r9
-; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK2-NEXT:    movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK2-NEXT:    shrxq %rax, %rcx, %r11
-; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    movzbl %sil, %esi
+; FALLBACK2-NEXT:    movq -64(%rsp,%rsi,4), %rdi
+; FALLBACK2-NEXT:    movq -56(%rsp,%rsi,4), %r8
+; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %r9
 ; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK2-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK2-NEXT:    orq %r9, %r10
+; FALLBACK2-NEXT:    shrxq %rcx, -72(%rsp,%rsi,4), %r9
 ; FALLBACK2-NEXT:    addq %rdi, %rdi
 ; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT:    orq %r8, %rdi
-; FALLBACK2-NEXT:    addq %rsi, %rsi
-; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r9, %rsi
-; FALLBACK2-NEXT:    addq %rcx, %rcx
-; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
-; FALLBACK2-NEXT:    orq %r10, %rax
-; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK2-NEXT:    orq %r9, %rdi
+; FALLBACK2-NEXT:    shrxq %rcx, %r8, %r8
+; FALLBACK2-NEXT:    movq -48(%rsp,%rsi,4), %rsi
+; FALLBACK2-NEXT:    leaq (%rsi,%rsi), %r9
+; FALLBACK2-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK2-NEXT:    orq %r8, %rax
+; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %rcx
+; FALLBACK2-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
-; FALLBACK2-NEXT:    movq %rsi, (%rdx)
-; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, (%rdx)
+; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
 ; FALLBACK2-NEXT:    retq
 ;
 ; FALLBACK3-LABEL: lshr_32bytes_dwordOff:
@@ -5307,30 +5298,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movl %eax, %esi
 ; FALLBACK6-NEXT:    andb $6, %cl
 ; FALLBACK6-NEXT:    movzbl %cl, %ecx
-; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK6-NEXT:    movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK6-NEXT:    movq -56(%rsp,%rcx,4), %r8
-; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK6-NEXT:    movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK6-NEXT:    shrxq %rax, %rcx, %r11
-; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    shrxq %rsi, -72(%rsp,%rcx,4), %rdi
 ; FALLBACK6-NEXT:    notb %al
-; FALLBACK6-NEXT:    addq %rdi, %rdi
-; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT:    orq %rsi, %rdi
-; FALLBACK6-NEXT:    addq %rcx, %rcx
-; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT:    orq %r9, %rcx
-; FALLBACK6-NEXT:    addq %r8, %r8
-; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK6-NEXT:    orq %r10, %rax
-; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq -64(%rsp,%rcx,4), %r8
+; FALLBACK6-NEXT:    movq -56(%rsp,%rcx,4), %r9
+; FALLBACK6-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK6-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK6-NEXT:    orq %rdi, %r10
+; FALLBACK6-NEXT:    shrxq %rsi, %r9, %rdi
+; FALLBACK6-NEXT:    movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK6-NEXT:    leaq (%rcx,%rcx), %r11
+; FALLBACK6-NEXT:    shlxq %rax, %r11, %r11
+; FALLBACK6-NEXT:    orq %rdi, %r11
+; FALLBACK6-NEXT:    shrxq %rsi, %r8, %rdi
+; FALLBACK6-NEXT:    addq %r9, %r9
+; FALLBACK6-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK6-NEXT:    orq %rdi, %rax
+; FALLBACK6-NEXT:    shrxq %rsi, %rcx, %rcx
+; FALLBACK6-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT:    movq %rdi, (%rdx)
+; FALLBACK6-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r10, (%rdx)
 ; FALLBACK6-NEXT:    retq
 ;
 ; FALLBACK7-LABEL: lshr_32bytes_dwordOff:
@@ -5441,36 +5432,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; FALLBACK10-LABEL: lshr_32bytes_dwordOff:
 ; FALLBACK10:       # %bb.0:
 ; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
-; FALLBACK10-NEXT:    movl %ecx, %eax
-; FALLBACK10-NEXT:    shlb $5, %al
+; FALLBACK10-NEXT:    movzbl (%rsi), %eax
+; FALLBACK10-NEXT:    movl %eax, %ecx
+; FALLBACK10-NEXT:    shlb $5, %cl
 ; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT:    andb $6, %cl
-; FALLBACK10-NEXT:    movzbl %cl, %ecx
-; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK10-NEXT:    movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK10-NEXT:    movq -56(%rsp,%rcx,4), %r8
-; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK10-NEXT:    movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK10-NEXT:    shrxq %rax, %rcx, %r11
-; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT:    notb %al
-; FALLBACK10-NEXT:    addq %rdi, %rdi
-; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT:    orq %rsi, %rdi
-; FALLBACK10-NEXT:    addq %rcx, %rcx
-; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT:    orq %r9, %rcx
-; FALLBACK10-NEXT:    addq %r8, %r8
-; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK10-NEXT:    orq %r10, %rax
-; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT:    movq %rdi, (%rdx)
+; FALLBACK10-NEXT:    movl %ecx, %esi
+; FALLBACK10-NEXT:    andb $6, %al
+; FALLBACK10-NEXT:    movzbl %al, %eax
+; FALLBACK10-NEXT:    shrxq %rsi, -72(%rsp,%rax,4), %rdi
+; FALLBACK10-NEXT:    notb %cl
+; FALLBACK10-NEXT:    movq -64(%rsp,%rax,4), %r8
+; FALLBACK10-NEXT:    movq -56(%rsp,%rax,4), %r9
+; FALLBACK10-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK10-NEXT:    shlxq %rcx, %r10, %r10
+; FALLBACK10-NEXT:    orq %rdi, %r10
+; FALLBACK10-NEXT:    shrxq %rsi, %r9, %rdi
+; FALLBACK10-NEXT:    movq -48(%rsp,%rax,4), %rax
+; FALLBACK10-NEXT:    leaq (%rax,%rax), %r11
+; FALLBACK10-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK10-NEXT:    orq %rdi, %r11
+; FALLBACK10-NEXT:    shrxq %rsi, %r8, %rdi
+; FALLBACK10-NEXT:    addq %r9, %r9
+; FALLBACK10-NEXT:    shlxq %rcx, %r9, %rcx
+; FALLBACK10-NEXT:    orq %rdi, %rcx
+; FALLBACK10-NEXT:    shrxq %rsi, %rax, %rax
+; FALLBACK10-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
+; FALLBACK10-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r10, (%rdx)
 ; FALLBACK10-NEXT:    vzeroupper
 ; FALLBACK10-NEXT:    retq
 ;
@@ -5580,36 +5571,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; FALLBACK14-LABEL: lshr_32bytes_dwordOff:
 ; FALLBACK14:       # %bb.0:
 ; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
-; FALLBACK14-NEXT:    movl %ecx, %eax
-; FALLBACK14-NEXT:    shlb $5, %al
+; FALLBACK14-NEXT:    movzbl (%rsi), %eax
+; FALLBACK14-NEXT:    movl %eax, %ecx
+; FALLBACK14-NEXT:    shlb $5, %cl
 ; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT:    andb $6, %cl
-; FALLBACK14-NEXT:    movzbl %cl, %ecx
-; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK14-NEXT:    movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK14-NEXT:    movq -56(%rsp,%rcx,4), %r8
-; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK14-NEXT:    movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK14-NEXT:    shrxq %rax, %rcx, %r11
-; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT:    notb %al
-; FALLBACK14-NEXT:    addq %rdi, %rdi
-; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT:    orq %rsi, %rdi
-; FALLBACK14-NEXT:    addq %rcx, %rcx
-; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT:    orq %r9, %rcx
-; FALLBACK14-NEXT:    addq %r8, %r8
-; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK14-NEXT:    orq %r10, %rax
-; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT:    movq %rdi, (%rdx)
+; FALLBACK14-NEXT:    movl %ecx, %esi
+; FALLBACK14-NEXT:    andb $6, %al
+; FALLBACK14-NEXT:    movzbl %al, %eax
+; FALLBACK14-NEXT:    shrxq %rsi, -72(%rsp,%rax,4), %rdi
+; FALLBACK14-NEXT:    notb %cl
+; FALLBACK14-NEXT:    movq -64(%rsp,%rax,4), %r8
+; FALLBACK14-NEXT:    movq -56(%rsp,%rax,4), %r9
+; FALLBACK14-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK14-NEXT:    shlxq %rcx, %r10, %r10
+; FALLBACK14-NEXT:    orq %rdi, %r10
+; FALLBACK14-NEXT:    shrxq %rsi, %r9, %rdi
+; FALLBACK14-NEXT:    movq -48(%rsp,%rax,4), %rax
+; FALLBACK14-NEXT:    leaq (%rax,%rax), %r11
+; FALLBACK14-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK14-NEXT:    orq %rdi, %r11
+; FALLBACK14-NEXT:    shrxq %rsi, %r8, %rdi
+; FALLBACK14-NEXT:    addq %r9, %r9
+; FALLBACK14-NEXT:    shlxq %rcx, %r9, %rcx
+; FALLBACK14-NEXT:    orq %rdi, %rcx
+; FALLBACK14-NEXT:    shrxq %rsi, %rax, %rax
+; FALLBACK14-NEXT:    movq %rax, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rcx, 8(%rdx)
+; FALLBACK14-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r10, (%rdx)
 ; FALLBACK14-NEXT:    vzeroupper
 ; FALLBACK14-NEXT:    retq
 ;
@@ -6025,31 +6016,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movl %eax, %ecx
 ; FALLBACK2-NEXT:    andb $24, %sil
 ; FALLBACK2-NEXT:    negb %sil
-; FALLBACK2-NEXT:    movsbq %sil, %rsi
-; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %rdi
-; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %rcx
-; FALLBACK2-NEXT:    shlxq %rax, %rcx, %r8
-; FALLBACK2-NEXT:    shlxq %rax, -16(%rsp,%rsi), %r9
-; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %rsi
-; FALLBACK2-NEXT:    shlxq %rax, %rsi, %r10
-; FALLBACK2-NEXT:    shlxq %rax, %rdi, %r11
-; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    movsbq %sil, %rdi
+; FALLBACK2-NEXT:    movq -40(%rsp,%rdi), %r8
+; FALLBACK2-NEXT:    movq -32(%rsp,%rdi), %rsi
+; FALLBACK2-NEXT:    shlxq %rcx, %rsi, %r9
 ; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    shlxq %rcx, %r8, %r10
+; FALLBACK2-NEXT:    shrq %r8
+; FALLBACK2-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK2-NEXT:    orq %r9, %r8
+; FALLBACK2-NEXT:    shlxq %rcx, -16(%rsp,%rdi), %r9
+; FALLBACK2-NEXT:    movq -24(%rsp,%rdi), %rdi
+; FALLBACK2-NEXT:    shlxq %rcx, %rdi, %rcx
 ; FALLBACK2-NEXT:    shrq %rdi
 ; FALLBACK2-NEXT:    shrxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT:    orq %r8, %rdi
+; FALLBACK2-NEXT:    orq %r9, %rdi
 ; FALLBACK2-NEXT:    shrq %rsi
-; FALLBACK2-NEXT:    shrxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r9, %rsi
-; FALLBACK2-NEXT:    shrq %rcx
-; FALLBACK2-NEXT:    shrxq %rax, %rcx, %rax
-; FALLBACK2-NEXT:    orq %r10, %rax
-; FALLBACK2-NEXT:    movq %r11, (%rdx)
+; FALLBACK2-NEXT:    shrxq %rax, %rsi, %rax
+; FALLBACK2-NEXT:    orq %rcx, %rax
+; FALLBACK2-NEXT:    movq %r10, (%rdx)
 ; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
-; FALLBACK2-NEXT:    movq %rsi, 24(%rdx)
-; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r8, 8(%rdx)
 ; FALLBACK2-NEXT:    retq
 ;
 ; FALLBACK3-LABEL: shl_32bytes:
@@ -6167,38 +6158,38 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6:       # %bb.0:
 ; FALLBACK6-NEXT:    movups (%rdi), %xmm0
 ; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
-; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
-; FALLBACK6-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK6-NEXT:    movzbl (%rsi), %esi
+; FALLBACK6-NEXT:    leal (,%rsi,8), %eax
 ; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
 ; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT:    andb $24, %cl
-; FALLBACK6-NEXT:    negb %cl
-; FALLBACK6-NEXT:    movsbq %cl, %rcx
-; FALLBACK6-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK6-NEXT:    movq -24(%rsp,%rcx), %rdi
-; FALLBACK6-NEXT:    shlxq %rax, %rdi, %r8
-; FALLBACK6-NEXT:    movq -40(%rsp,%rcx), %r9
-; FALLBACK6-NEXT:    movq -32(%rsp,%rcx), %rcx
-; FALLBACK6-NEXT:    shlxq %rax, %rcx, %r10
-; FALLBACK6-NEXT:    shlxq %rax, %r9, %r11
-; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    movl %eax, %ecx
+; FALLBACK6-NEXT:    andb $24, %sil
+; FALLBACK6-NEXT:    negb %sil
+; FALLBACK6-NEXT:    movsbq %sil, %rsi
+; FALLBACK6-NEXT:    shlxq %rcx, -16(%rsp,%rsi), %rdi
 ; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    movq -24(%rsp,%rsi), %r8
+; FALLBACK6-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK6-NEXT:    shrq %r8
+; FALLBACK6-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK6-NEXT:    orq %rdi, %r8
+; FALLBACK6-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK6-NEXT:    movq -32(%rsp,%rsi), %rsi
+; FALLBACK6-NEXT:    shlxq %rcx, %rsi, %r10
+; FALLBACK6-NEXT:    shrq %rsi
+; FALLBACK6-NEXT:    shrxq %rax, %rsi, %rsi
+; FALLBACK6-NEXT:    orq %r9, %rsi
+; FALLBACK6-NEXT:    shlxq %rcx, %rdi, %rcx
 ; FALLBACK6-NEXT:    shrq %rdi
-; FALLBACK6-NEXT:    shrxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT:    orq %rsi, %rdi
-; FALLBACK6-NEXT:    shrq %rcx
-; FALLBACK6-NEXT:    shrxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT:    orq %r8, %rcx
-; FALLBACK6-NEXT:    shrq %r9
-; FALLBACK6-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %rax
 ; FALLBACK6-NEXT:    orq %r10, %rax
-; FALLBACK6-NEXT:    movq %r11, (%rdx)
+; FALLBACK6-NEXT:    movq %rcx, (%rdx)
 ; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r8, 24(%rdx)
 ; FALLBACK6-NEXT:    retq
 ;
 ; FALLBACK7-LABEL: shl_32bytes:
@@ -6308,36 +6299,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-LABEL: shl_32bytes:
 ; FALLBACK10:       # %bb.0:
 ; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
-; FALLBACK10-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK10-NEXT:    movzbl (%rsi), %esi
+; FALLBACK10-NEXT:    leal (,%rsi,8), %eax
 ; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT:    andb $24, %cl
-; FALLBACK10-NEXT:    negb %cl
-; FALLBACK10-NEXT:    movsbq %cl, %rcx
-; FALLBACK10-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK10-NEXT:    movq -24(%rsp,%rcx), %rdi
-; FALLBACK10-NEXT:    shlxq %rax, %rdi, %r8
-; FALLBACK10-NEXT:    movq -40(%rsp,%rcx), %r9
-; FALLBACK10-NEXT:    movq -32(%rsp,%rcx), %rcx
-; FALLBACK10-NEXT:    shlxq %rax, %rcx, %r10
-; FALLBACK10-NEXT:    shlxq %rax, %r9, %r11
-; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    movl %eax, %ecx
+; FALLBACK10-NEXT:    andb $24, %sil
+; FALLBACK10-NEXT:    negb %sil
+; FALLBACK10-NEXT:    movsbq %sil, %rsi
+; FALLBACK10-NEXT:    shlxq %rcx, -16(%rsp,%rsi), %rdi
 ; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    movq -24(%rsp,%rsi), %r8
+; FALLBACK10-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK10-NEXT:    shrq %r8
+; FALLBACK10-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK10-NEXT:    orq %rdi, %r8
+; FALLBACK10-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK10-NEXT:    movq -32(%rsp,%rsi), %rsi
+; FALLBACK10-NEXT:    shlxq %rcx, %rsi, %r10
+; FALLBACK10-NEXT:    shrq %rsi
+; FALLBACK10-NEXT:    shrxq %rax, %rsi, %rsi
+; FALLBACK10-NEXT:    orq %r9, %rsi
+; FALLBACK10-NEXT:    shlxq %rcx, %rdi, %rcx
 ; FALLBACK10-NEXT:    shrq %rdi
-; FALLBACK10-NEXT:    shrxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT:    orq %rsi, %rdi
-; FALLBACK10-NEXT:    shrq %rcx
-; FALLBACK10-NEXT:    shrxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT:    orq %r8, %rcx
-; FALLBACK10-NEXT:    shrq %r9
-; FALLBACK10-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %rax
 ; FALLBACK10-NEXT:    orq %r10, %rax
-; FALLBACK10-NEXT:    movq %r11, (%rdx)
+; FALLBACK10-NEXT:    movq %rcx, (%rdx)
 ; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r8, 24(%rdx)
 ; FALLBACK10-NEXT:    vzeroupper
 ; FALLBACK10-NEXT:    retq
 ;
@@ -6446,36 +6437,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-LABEL: shl_32bytes:
 ; FALLBACK14:       # %bb.0:
 ; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
-; FALLBACK14-NEXT:    leal (,%rcx,8), %eax
+; FALLBACK14-NEXT:    movzbl (%rsi), %esi
+; FALLBACK14-NEXT:    leal (,%rsi,8), %eax
 ; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT:    andb $24, %cl
-; FALLBACK14-NEXT:    negb %cl
-; FALLBACK14-NEXT:    movsbq %cl, %rcx
-; FALLBACK14-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK14-NEXT:    movq -24(%rsp,%rcx), %rdi
-; FALLBACK14-NEXT:    shlxq %rax, %rdi, %r8
-; FALLBACK14-NEXT:    movq -40(%rsp,%rcx), %r9
-; FALLBACK14-NEXT:    movq -32(%rsp,%rcx), %rcx
-; FALLBACK14-NEXT:    shlxq %rax, %rcx, %r10
-; FALLBACK14-NEXT:    shlxq %rax, %r9, %r11
-; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    movl %eax, %ecx
+; FALLBACK14-NEXT:    andb $24, %sil
+; FALLBACK14-NEXT:    negb %sil
+; FALLBACK14-NEXT:    movsbq %sil, %rsi
+; FALLBACK14-NEXT:    shlxq %rcx, -16(%rsp,%rsi), %rdi
 ; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    movq -24(%rsp,%rsi), %r8
+; FALLBACK14-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK14-NEXT:    shrq %r8
+; FALLBACK14-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK14-NEXT:    orq %rdi, %r8
+; FALLBACK14-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT:    movq -32(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT:    shlxq %rcx, %rsi, %r10
+; FALLBACK14-NEXT:    shrq %rsi
+; FALLBACK14-NEXT:    shrxq %rax, %rsi, %rsi
+; FALLBACK14-NEXT:    orq %r9, %rsi
+; FALLBACK14-NEXT:    shlxq %rcx, %rdi, %rcx
 ; FALLBACK14-NEXT:    shrq %rdi
-; FALLBACK14-NEXT:    shrxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT:    orq %rsi, %rdi
-; FALLBACK14-NEXT:    shrq %rcx
-; FALLBACK14-NEXT:    shrxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT:    orq %r8, %rcx
-; FALLBACK14-NEXT:    shrq %r9
-; FALLBACK14-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %rax
 ; FALLBACK14-NEXT:    orq %r10, %rax
-; FALLBACK14-NEXT:    movq %r11, (%rdx)
+; FALLBACK14-NEXT:    movq %rcx, (%rdx)
 ; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r8, 24(%rdx)
 ; FALLBACK14-NEXT:    vzeroupper
 ; FALLBACK14-NEXT:    retq
 ;
@@ -6745,71 +6736,75 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edx, %eax
+; FALLBACK18-NEXT:    movl %eax, %ebp
 ; FALLBACK18-NEXT:    andb $28, %bl
 ; FALLBACK18-NEXT:    negb %bl
 ; FALLBACK18-NEXT:    movsbl %bl, %esi
 ; FALLBACK18-NEXT:    movl 64(%esp,%esi), %ebx
 ; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 68(%esp,%esi), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, %eax, %edi
-; FALLBACK18-NEXT:    movl %edx, %ecx
-; FALLBACK18-NEXT:    notb %cl
+; FALLBACK18-NEXT:    movl 68(%esp,%esi), %ecx
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %eax, %ecx, %edi
+; FALLBACK18-NEXT:    notb %dl
 ; FALLBACK18-NEXT:    shrl %ebx
-; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %ebx
+; FALLBACK18-NEXT:    shrxl %edx, %ebx, %ebx
 ; FALLBACK18-NEXT:    orl %edi, %ebx
 ; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 72(%esp,%esi), %ebx
 ; FALLBACK18-NEXT:    movl %ebx, %edi
 ; FALLBACK18-NEXT:    shrl %edi
-; FALLBACK18-NEXT:    shrxl %ecx, %edi, %eax
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    movl 76(%esp,%esi), %edi
-; FALLBACK18-NEXT:    shlxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    movl %ebp, %esi
+; FALLBACK18-NEXT:    shlxl %ebp, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, %ebx, %ebx
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT:    shrl %eax
-; FALLBACK18-NEXT:    shrxl %ecx, %eax, %eax
-; FALLBACK18-NEXT:    orl %ebx, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 80(%esp,%esi), %ebx
-; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %esi, %ebx, %ebx
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT:    orl %ebx, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK18-NEXT:    movl 80(%esp,%ebp), %ecx
+; FALLBACK18-NEXT:    movl %ecx, %ebx
 ; FALLBACK18-NEXT:    shrl %ebx
-; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %eax
-; FALLBACK18-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK18-NEXT:    shlxl %edx, %ebx, %ebp
+; FALLBACK18-NEXT:    shrxl %edx, %ebx, %eax
+; FALLBACK18-NEXT:    movl 84(%esp,%ebp), %ebx
+; FALLBACK18-NEXT:    shlxl %esi, %ebx, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shlxl %esi, %ecx, %ecx
+; FALLBACK18-NEXT:    movl %esi, %eax
 ; FALLBACK18-NEXT:    shrl %edi
-; FALLBACK18-NEXT:    shrxl %ecx, %edi, %edi
-; FALLBACK18-NEXT:    orl %eax, %edi
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, 92(%esp,%esi), %ebp
-; FALLBACK18-NEXT:    movl 88(%esp,%esi), %esi
-; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %edi
+; FALLBACK18-NEXT:    orl %ecx, %edi
+; FALLBACK18-NEXT:    shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    shlxl %esi, 92(%esp,%ecx), %ebp
+; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    shlxl %eax, %esi, %ecx
 ; FALLBACK18-NEXT:    shrl %esi
-; FALLBACK18-NEXT:    shrxl %ecx, %esi, %esi
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK18-NEXT:    orl %ebp, %esi
 ; FALLBACK18-NEXT:    shrl %ebx
-; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %edx
-; FALLBACK18-NEXT:    orl %eax, %edx
-; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, (%eax)
-; FALLBACK18-NEXT:    movl %edx, 24(%eax)
-; FALLBACK18-NEXT:    movl %esi, 28(%eax)
-; FALLBACK18-NEXT:    movl %edi, 16(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK18-NEXT:    shrxl %edx, %ebx, %eax
+; FALLBACK18-NEXT:    orl %ecx, %eax
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    movl %ecx, (%edx)
+; FALLBACK18-NEXT:    movl %eax, 24(%edx)
+; FALLBACK18-NEXT:    movl %esi, 28(%edx)
+; FALLBACK18-NEXT:    movl %edi, 16(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 20(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 8(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 12(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 4(%edx)
 ; FALLBACK18-NEXT:    addl $108, %esp
 ; FALLBACK18-NEXT:    popl %esi
 ; FALLBACK18-NEXT:    popl %edi
@@ -7085,78 +7080,76 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK22-NEXT:    movups (%ecx), %xmm0
 ; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
-; FALLBACK22-NEXT:    movzbl (%eax), %ecx
-; FALLBACK22-NEXT:    movl %ecx, %eax
-; FALLBACK22-NEXT:    shlb $3, %al
+; FALLBACK22-NEXT:    movzbl (%eax), %edx
+; FALLBACK22-NEXT:    movl %edx, %ecx
+; FALLBACK22-NEXT:    shlb $3, %cl
 ; FALLBACK22-NEXT:    xorps %xmm2, %xmm2
 ; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    andb $28, %cl
-; FALLBACK22-NEXT:    negb %cl
-; FALLBACK22-NEXT:    movsbl %cl, %edx
-; FALLBACK22-NEXT:    movl 84(%esp,%edx), %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %eax, %ecx, %ecx
-; FALLBACK22-NEXT:    movl 80(%esp,%edx), %esi
-; FALLBACK22-NEXT:    shlxl %eax, %esi, %edi
-; FALLBACK22-NEXT:    movl %eax, %ebx
-; FALLBACK22-NEXT:    notb %bl
-; FALLBACK22-NEXT:    shrl %esi
-; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %ecx, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 76(%esp,%edx), %ecx
-; FALLBACK22-NEXT:    movl %ecx, %esi
-; FALLBACK22-NEXT:    shrl %esi
-; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %edi, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %eax, %ecx, %ecx
-; FALLBACK22-NEXT:    movl 72(%esp,%edx), %esi
-; FALLBACK22-NEXT:    movl %esi, %edi
+; FALLBACK22-NEXT:    movl %ecx, %ebx
+; FALLBACK22-NEXT:    andb $28, %dl
+; FALLBACK22-NEXT:    negb %dl
+; FALLBACK22-NEXT:    movsbl %dl, %edx
+; FALLBACK22-NEXT:    movl 84(%esp,%edx), %eax
+; FALLBACK22-NEXT:    shlxl %ebx, %eax, %esi
+; FALLBACK22-NEXT:    notb %cl
+; FALLBACK22-NEXT:    movl 80(%esp,%edx), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %ebp
 ; FALLBACK22-NEXT:    shrl %edi
-; FALLBACK22-NEXT:    shrxl %ebx, %edi, %edi
-; FALLBACK22-NEXT:    orl %ecx, %edi
+; FALLBACK22-NEXT:    shrxl %ecx, %edi, %edi
+; FALLBACK22-NEXT:    orl %esi, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %eax, %esi, %ecx
-; FALLBACK22-NEXT:    movl 68(%esp,%edx), %esi
+; FALLBACK22-NEXT:    movl 76(%esp,%edx), %esi
 ; FALLBACK22-NEXT:    movl %esi, %edi
 ; FALLBACK22-NEXT:    shrl %edi
-; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ebp
-; FALLBACK22-NEXT:    orl %ecx, %ebp
-; FALLBACK22-NEXT:    shlxl %eax, %esi, %edi
+; FALLBACK22-NEXT:    shrxl %ecx, %edi, %edi
+; FALLBACK22-NEXT:    orl %ebp, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    movl 72(%esp,%edx), %edi
+; FALLBACK22-NEXT:    movl %edi, %ebp
+; FALLBACK22-NEXT:    shrl %ebp
+; FALLBACK22-NEXT:    shrxl %ecx, %ebp, %ebp
+; FALLBACK22-NEXT:    orl %esi, %ebp
+; FALLBACK22-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %esi
+; FALLBACK22-NEXT:    movl 68(%esp,%edx), %ebp
+; FALLBACK22-NEXT:    movl %ebp, %edi
+; FALLBACK22-NEXT:    shrl %edi
+; FALLBACK22-NEXT:    shrxl %ecx, %edi, %edi
+; FALLBACK22-NEXT:    orl %esi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebp
 ; FALLBACK22-NEXT:    movl 64(%esp,%edx), %esi
-; FALLBACK22-NEXT:    movl %esi, %ecx
-; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT:    orl %edi, %ecx
-; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
 ; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %eax, 92(%esp,%edx), %edi
-; FALLBACK22-NEXT:    movl 88(%esp,%edx), %edx
-; FALLBACK22-NEXT:    shlxl %eax, %edx, %esi
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ecx, %esi, %edi
+; FALLBACK22-NEXT:    orl %ebp, %edi
 ; FALLBACK22-NEXT:    shrl %eax
-; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT:    orl %esi, %eax
-; FALLBACK22-NEXT:    shrl %edx
-; FALLBACK22-NEXT:    shrxl %ebx, %edx, %edx
-; FALLBACK22-NEXT:    orl %edi, %edx
-; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK22-NEXT:    movl %edi, (%esi)
-; FALLBACK22-NEXT:    movl %edx, 28(%esi)
-; FALLBACK22-NEXT:    movl %eax, 24(%esi)
-; FALLBACK22-NEXT:    movl %ecx, 4(%esi)
-; FALLBACK22-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK22-NEXT:    shrxl %ecx, %eax, %esi
+; FALLBACK22-NEXT:    movl 88(%esp,%edx), %eax
+; FALLBACK22-NEXT:    shlxl %ebx, %eax, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %esi
+; FALLBACK22-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shlxl %ebx, 92(%esp,%edx), %edx
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ecx, %eax, %eax
+; FALLBACK22-NEXT:    orl %edx, %eax
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT:    movl %ebp, (%ecx)
+; FALLBACK22-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK22-NEXT:    movl %esi, 24(%ecx)
+; FALLBACK22-NEXT:    movl %edi, 4(%ecx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 8(%ecx)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT:    movl %eax, 12(%esi)
+; FALLBACK22-NEXT:    movl %eax, 12(%ecx)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT:    movl %eax, 16(%esi)
+; FALLBACK22-NEXT:    movl %eax, 16(%ecx)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT:    movl %eax, 20(%esi)
+; FALLBACK22-NEXT:    movl %eax, 20(%ecx)
 ; FALLBACK22-NEXT:    addl $108, %esp
 ; FALLBACK22-NEXT:    popl %esi
 ; FALLBACK22-NEXT:    popl %edi
@@ -7410,76 +7403,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK26-NEXT:    movzbl (%eax), %ecx
-; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    movzbl (%eax), %edx
+; FALLBACK26-NEXT:    movl %edx, %eax
 ; FALLBACK26-NEXT:    shlb $3, %al
 ; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    andb $28, %cl
-; FALLBACK26-NEXT:    negb %cl
-; FALLBACK26-NEXT:    movsbl %cl, %edx
-; FALLBACK26-NEXT:    movl 84(%esp,%edx), %ecx
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %eax, %ecx, %ecx
-; FALLBACK26-NEXT:    movl 80(%esp,%edx), %esi
-; FALLBACK26-NEXT:    shlxl %eax, %esi, %edi
 ; FALLBACK26-NEXT:    movl %eax, %ebx
-; FALLBACK26-NEXT:    notb %bl
-; FALLBACK26-NEXT:    shrl %esi
-; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %ecx, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 76(%esp,%edx), %ecx
-; FALLBACK26-NEXT:    movl %ecx, %esi
-; FALLBACK26-NEXT:    shrl %esi
-; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %edi, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %eax, %ecx, %ecx
-; FALLBACK26-NEXT:    movl 72(%esp,%edx), %esi
-; FALLBACK26-NEXT:    movl %esi, %edi
+; FALLBACK26-NEXT:    andb $28, %dl
+; FALLBACK26-NEXT:    negb %dl
+; FALLBACK26-NEXT:    movsbl %dl, %edx
+; FALLBACK26-NEXT:    movl 84(%esp,%edx), %ecx
+; FALLBACK26-NEXT:    shlxl %ebx, %ecx, %esi
+; FALLBACK26-NEXT:    notb %al
+; FALLBACK26-NEXT:    movl 80(%esp,%edx), %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    shrl %edi
-; FALLBACK26-NEXT:    shrxl %ebx, %edi, %edi
-; FALLBACK26-NEXT:    orl %ecx, %edi
+; FALLBACK26-NEXT:    shrxl %eax, %edi, %edi
+; FALLBACK26-NEXT:    orl %esi, %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %eax, %esi, %ecx
-; FALLBACK26-NEXT:    movl 68(%esp,%edx), %esi
+; FALLBACK26-NEXT:    movl 76(%esp,%edx), %esi
 ; FALLBACK26-NEXT:    movl %esi, %edi
 ; FALLBACK26-NEXT:    shrl %edi
-; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
-; FALLBACK26-NEXT:    orl %ecx, %ebp
-; FALLBACK26-NEXT:    shlxl %eax, %esi, %edi
+; FALLBACK26-NEXT:    shrxl %eax, %edi, %edi
+; FALLBACK26-NEXT:    orl %ebp, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    movl 72(%esp,%edx), %edi
+; FALLBACK26-NEXT:    movl %edi, %ebp
+; FALLBACK26-NEXT:    shrl %ebp
+; FALLBACK26-NEXT:    shrxl %eax, %ebp, %ebp
+; FALLBACK26-NEXT:    orl %esi, %ebp
+; FALLBACK26-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %esi
+; FALLBACK26-NEXT:    movl 68(%esp,%edx), %ebp
+; FALLBACK26-NEXT:    movl %ebp, %edi
+; FALLBACK26-NEXT:    shrl %edi
+; FALLBACK26-NEXT:    shrxl %eax, %edi, %edi
+; FALLBACK26-NEXT:    orl %esi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %ebp
 ; FALLBACK26-NEXT:    movl 64(%esp,%edx), %esi
-; FALLBACK26-NEXT:    movl %esi, %ecx
-; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT:    orl %edi, %ecx
-; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %eax, 92(%esp,%edx), %edi
-; FALLBACK26-NEXT:    movl 88(%esp,%edx), %edx
-; FALLBACK26-NEXT:    shlxl %eax, %edx, %esi
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %eax, %esi, %edi
+; FALLBACK26-NEXT:    orl %ebp, %edi
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %eax, %ecx, %esi
+; FALLBACK26-NEXT:    movl 88(%esp,%edx), %ecx
+; FALLBACK26-NEXT:    shlxl %ebx, %ecx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %esi
+; FALLBACK26-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shlxl %ebx, 92(%esp,%edx), %edx
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %eax, %ecx, %eax
+; FALLBACK26-NEXT:    orl %edx, %eax
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT:    movl %ebp, (%ecx)
+; FALLBACK26-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK26-NEXT:    movl %esi, 24(%ecx)
+; FALLBACK26-NEXT:    movl %edi, 4(%ecx)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT:    shrl %eax
-; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT:    orl %esi, %eax
-; FALLBACK26-NEXT:    shrl %edx
-; FALLBACK26-NEXT:    shrxl %ebx, %edx, %edx
-; FALLBACK26-NEXT:    orl %edi, %edx
-; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK26-NEXT:    movl %edi, (%esi)
-; FALLBACK26-NEXT:    movl %edx, 28(%esi)
-; FALLBACK26-NEXT:    movl %eax, 24(%esi)
-; FALLBACK26-NEXT:    movl %ecx, 4(%esi)
-; FALLBACK26-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK26-NEXT:    movl %eax, 8(%ecx)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT:    movl %eax, 12(%esi)
+; FALLBACK26-NEXT:    movl %eax, 12(%ecx)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT:    movl %eax, 16(%esi)
+; FALLBACK26-NEXT:    movl %eax, 16(%ecx)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT:    movl %eax, 20(%esi)
+; FALLBACK26-NEXT:    movl %eax, 20(%ecx)
 ; FALLBACK26-NEXT:    addl $108, %esp
 ; FALLBACK26-NEXT:    popl %esi
 ; FALLBACK26-NEXT:    popl %edi
@@ -7732,76 +7723,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK30-NEXT:    movzbl (%eax), %ecx
-; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    movzbl (%eax), %edx
+; FALLBACK30-NEXT:    movl %edx, %eax
 ; FALLBACK30-NEXT:    shlb $3, %al
 ; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK30-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    andb $28, %cl
-; FALLBACK30-NEXT:    negb %cl
-; FALLBACK30-NEXT:    movsbl %cl, %edx
-; FALLBACK30-NEXT:    movl 84(%esp,%edx), %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %eax, %ecx, %ecx
-; FALLBACK30-NEXT:    movl 80(%esp,%edx), %esi
-; FALLBACK30-NEXT:    shlxl %eax, %esi, %edi
 ; FALLBACK30-NEXT:    movl %eax, %ebx
-; FALLBACK30-NEXT:    notb %bl
-; FALLBACK30-NEXT:    shrl %esi
-; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %ecx, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 76(%esp,%edx), %ecx
-; FALLBACK30-NEXT:    movl %ecx, %esi
-; FALLBACK30-NEXT:    shrl %esi
-; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %edi, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %eax, %ecx, %ecx
-; FALLBACK30-NEXT:    movl 72(%esp,%edx), %esi
-; FALLBACK30-NEXT:    movl %esi, %edi
+; FALLBACK30-NEXT:    andb $28, %dl
+; FALLBACK30-NEXT:    negb %dl
+; FALLBACK30-NEXT:    movsbl %dl, %edx
+; FALLBACK30-NEXT:    movl 84(%esp,%edx), %ecx
+; FALLBACK30-NEXT:    shlxl %ebx, %ecx, %esi
+; FALLBACK30-NEXT:    notb %al
+; FALLBACK30-NEXT:    movl 80(%esp,%edx), %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %ebp
 ; FALLBACK30-NEXT:    shrl %edi
-; FALLBACK30-NEXT:    shrxl %ebx, %edi, %edi
-; FALLBACK30-NEXT:    orl %ecx, %edi
+; FALLBACK30-NEXT:    shrxl %eax, %edi, %edi
+; FALLBACK30-NEXT:    orl %esi, %edi
 ; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %eax, %esi, %ecx
-; FALLBACK30-NEXT:    movl 68(%esp,%edx), %esi
+; FALLBACK30-NEXT:    movl 76(%esp,%edx), %esi
 ; FALLBACK30-NEXT:    movl %esi, %edi
 ; FALLBACK30-NEXT:    shrl %edi
-; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ebp
-; FALLBACK30-NEXT:    orl %ecx, %ebp
-; FALLBACK30-NEXT:    shlxl %eax, %esi, %edi
-; FALLBACK30-NEXT:    movl 64(%esp,%edx), %esi
-; FALLBACK30-NEXT:    movl %esi, %ecx
-; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT:    orl %edi, %ecx
-; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %eax, 92(%esp,%edx), %edi
-; FALLBACK30-NEXT:    movl 88(%esp,%edx), %edx
-; FALLBACK30-NEXT:    shlxl %eax, %edx, %esi
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT:    shrl %eax
-; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT:    orl %esi, %eax
-; FALLBACK30-NEXT:    shrl %edx
-; FALLBACK30-NEXT:    shrxl %ebx, %edx, %edx
-; FALLBACK30-NEXT:    orl %edi, %edx
-; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK30-NEXT:    movl %edi, (%esi)
-; FALLBACK30-NEXT:    movl %edx, 28(%esi)
-; FALLBACK30-NEXT:    movl %eax, 24(%esi)
-; FALLBACK30-NEXT:    movl %ecx, 4(%esi)
-; FALLBACK30-NEXT:    movl %ebp, 8(%esi)
+; FALLBACK30-NEXT:    shrxl %eax, %edi, %edi
+; FALLBACK30-NEXT:    orl %ebp, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    movl 72(%esp,%edx), %edi
+; FALLBACK30-NEXT:    movl %edi, %ebp
+; FALLBACK30-NEXT:    shrl %ebp
+; FALLBACK30-NEXT:    shrxl %eax, %ebp, %ebp
+; FALLBACK30-NEXT:    orl %esi, %ebp
+; FALLBACK30-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %esi
+; FALLBACK30-NEXT:    movl 68(%esp,%edx), %ebp
+; FALLBACK30-NEXT:    movl %ebp, %edi
+; FALLBACK30-NEXT:    shrl %edi
+; FALLBACK30-NEXT:    shrxl %eax, %edi, %edi
+; FALLBACK30-NEXT:    orl %esi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT:    movl 64(%esp,%edx), %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %eax, %esi, %edi
+; FALLBACK30-NEXT:    orl %ebp, %edi
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %eax, %ecx, %esi
+; FALLBACK30-NEXT:    movl 88(%esp,%edx), %ecx
+; FALLBACK30-NEXT:    shlxl %ebx, %ecx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %esi
+; FALLBACK30-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shlxl %ebx, 92(%esp,%edx), %edx
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %eax, %ecx, %eax
+; FALLBACK30-NEXT:    orl %edx, %eax
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT:    movl %ebp, (%ecx)
+; FALLBACK30-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK30-NEXT:    movl %esi, 24(%ecx)
+; FALLBACK30-NEXT:    movl %edi, 4(%ecx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 8(%ecx)
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT:    movl %eax, 12(%esi)
+; FALLBACK30-NEXT:    movl %eax, 12(%ecx)
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT:    movl %eax, 16(%esi)
+; FALLBACK30-NEXT:    movl %eax, 16(%ecx)
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT:    movl %eax, 20(%esi)
+; FALLBACK30-NEXT:    movl %eax, 20(%ecx)
 ; FALLBACK30-NEXT:    addl $108, %esp
 ; FALLBACK30-NEXT:    popl %esi
 ; FALLBACK30-NEXT:    popl %edi
@@ -7987,32 +7976,32 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movl %eax, %ecx
 ; FALLBACK2-NEXT:    shlb $2, %sil
 ; FALLBACK2-NEXT:    andb $24, %sil
 ; FALLBACK2-NEXT:    negb %sil
-; FALLBACK2-NEXT:    movsbq %sil, %rsi
-; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %rdi
-; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %rcx
-; FALLBACK2-NEXT:    shlxq %rax, %rcx, %r8
-; FALLBACK2-NEXT:    shlxq %rax, -16(%rsp,%rsi), %r9
-; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %rsi
-; FALLBACK2-NEXT:    shlxq %rax, %rsi, %r10
-; FALLBACK2-NEXT:    shlxq %rax, %rdi, %r11
-; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    movsbq %sil, %rdi
+; FALLBACK2-NEXT:    movq -40(%rsp,%rdi), %r8
+; FALLBACK2-NEXT:    movq -32(%rsp,%rdi), %rsi
+; FALLBACK2-NEXT:    shlxq %rcx, %rsi, %r9
 ; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    shlxq %rcx, %r8, %r10
+; FALLBACK2-NEXT:    shrq %r8
+; FALLBACK2-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK2-NEXT:    orq %r9, %r8
+; FALLBACK2-NEXT:    shlxq %rcx, -16(%rsp,%rdi), %r9
+; FALLBACK2-NEXT:    movq -24(%rsp,%rdi), %rdi
+; FALLBACK2-NEXT:    shlxq %rcx, %rdi, %rcx
 ; FALLBACK2-NEXT:    shrq %rdi
 ; FALLBACK2-NEXT:    shrxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT:    orq %r8, %rdi
+; FALLBACK2-NEXT:    orq %r9, %rdi
 ; FALLBACK2-NEXT:    shrq %rsi
-; FALLBACK2-NEXT:    shrxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r9, %rsi
-; FALLBACK2-NEXT:    shrq %rcx
-; FALLBACK2-NEXT:    shrxq %rax, %rcx, %rax
-; FALLBACK2-NEXT:    orq %r10, %rax
-; FALLBACK2-NEXT:    movq %r11, (%rdx)
+; FALLBACK2-NEXT:    shrxq %rax, %rsi, %rax
+; FALLBACK2-NEXT:    orq %rcx, %rax
+; FALLBACK2-NEXT:    movq %r10, (%rdx)
 ; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
-; FALLBACK2-NEXT:    movq %rsi, 24(%rdx)
-; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r8, 8(%rdx)
 ; FALLBACK2-NEXT:    retq
 ;
 ; FALLBACK3-LABEL: shl_32bytes_dwordOff:
@@ -8135,40 +8124,40 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; FALLBACK6:       # %bb.0:
 ; FALLBACK6-NEXT:    movups (%rdi), %xmm0
 ; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
-; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
-; FALLBACK6-NEXT:    movl %ecx, %eax
+; FALLBACK6-NEXT:    movzbl (%rsi), %esi
+; FALLBACK6-NEXT:    movl %esi, %eax
 ; FALLBACK6-NEXT:    shlb $5, %al
 ; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
 ; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT:    shlb $2, %cl
-; FALLBACK6-NEXT:    andb $24, %cl
-; FALLBACK6-NEXT:    negb %cl
-; FALLBACK6-NEXT:    movsbq %cl, %rcx
-; FALLBACK6-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK6-NEXT:    movq -24(%rsp,%rcx), %rdi
-; FALLBACK6-NEXT:    shlxq %rax, %rdi, %r8
-; FALLBACK6-NEXT:    movq -40(%rsp,%rcx), %r9
-; FALLBACK6-NEXT:    movq -32(%rsp,%rcx), %rcx
-; FALLBACK6-NEXT:    shlxq %rax, %rcx, %r10
-; FALLBACK6-NEXT:    shlxq %rax, %r9, %r11
-; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    movl %eax, %ecx
+; FALLBACK6-NEXT:    shlb $2, %sil
+; FALLBACK6-NEXT:    andb $24, %sil
+; FALLBACK6-NEXT:    negb %sil
+; FALLBACK6-NEXT:    movsbq %sil, %rsi
+; FALLBACK6-NEXT:    shlxq %rcx, -16(%rsp,%rsi), %rdi
 ; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    movq -24(%rsp,%rsi), %r8
+; FALLBACK6-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK6-NEXT:    shrq %r8
+; FALLBACK6-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK6-NEXT:    orq %rdi, %r8
+; FALLBACK6-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK6-NEXT:    movq -32(%rsp,%rsi), %rsi
+; FALLBACK6-NEXT:    shlxq %rcx, %rsi, %r10
+; FALLBACK6-NEXT:    shrq %rsi
+; FALLBACK6-NEXT:    shrxq %rax, %rsi, %rsi
+; FALLBACK6-NEXT:    orq %r9, %rsi
+; FALLBACK6-NEXT:    shlxq %rcx, %rdi, %rcx
 ; FALLBACK6-NEXT:    shrq %rdi
-; FALLBACK6-NEXT:    shrxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT:    orq %rsi, %rdi
-; FALLBACK6-NEXT:    shrq %rcx
-; FALLBACK6-NEXT:    shrxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT:    orq %r8, %rcx
-; FALLBACK6-NEXT:    shrq %r9
-; FALLBACK6-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %rax
 ; FALLBACK6-NEXT:    orq %r10, %rax
-; FALLBACK6-NEXT:    movq %r11, (%rdx)
+; FALLBACK6-NEXT:    movq %rcx, (%rdx)
 ; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r8, 24(%rdx)
 ; FALLBACK6-NEXT:    retq
 ;
 ; FALLBACK7-LABEL: shl_32bytes_dwordOff:
@@ -8283,38 +8272,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; FALLBACK10-LABEL: shl_32bytes_dwordOff:
 ; FALLBACK10:       # %bb.0:
 ; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
-; FALLBACK10-NEXT:    movl %ecx, %eax
+; FALLBACK10-NEXT:    movzbl (%rsi), %esi
+; FALLBACK10-NEXT:    movl %esi, %eax
 ; FALLBACK10-NEXT:    shlb $5, %al
 ; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT:    shlb $2, %cl
-; FALLBACK10-NEXT:    andb $24, %cl
-; FALLBACK10-NEXT:    negb %cl
-; FALLBACK10-NEXT:    movsbq %cl, %rcx
-; FALLBACK10-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK10-NEXT:    movq -24(%rsp,%rcx), %rdi
-; FALLBACK10-NEXT:    shlxq %rax, %rdi, %r8
-; FALLBACK10-NEXT:    movq -40(%rsp,%rcx), %r9
-; FALLBACK10-NEXT:    movq -32(%rsp,%rcx), %rcx
-; FALLBACK10-NEXT:    shlxq %rax, %rcx, %r10
-; FALLBACK10-NEXT:    shlxq %rax, %r9, %r11
-; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    movl %eax, %ecx
+; FALLBACK10-NEXT:    shlb $2, %sil
+; FALLBACK10-NEXT:    andb $24, %sil
+; FALLBACK10-NEXT:    negb %sil
+; FALLBACK10-NEXT:    movsbq %sil, %rsi
+; FALLBACK10-NEXT:    shlxq %rcx, -16(%rsp,%rsi), %rdi
 ; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    movq -24(%rsp,%rsi), %r8
+; FALLBACK10-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK10-NEXT:    shrq %r8
+; FALLBACK10-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK10-NEXT:    orq %rdi, %r8
+; FALLBACK10-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK10-NEXT:    movq -32(%rsp,%rsi), %rsi
+; FALLBACK10-NEXT:    shlxq %rcx, %rsi, %r10
+; FALLBACK10-NEXT:    shrq %rsi
+; FALLBACK10-NEXT:    shrxq %rax, %rsi, %rsi
+; FALLBACK10-NEXT:    orq %r9, %rsi
+; FALLBACK10-NEXT:    shlxq %rcx, %rdi, %rcx
 ; FALLBACK10-NEXT:    shrq %rdi
-; FALLBACK10-NEXT:    shrxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT:    orq %rsi, %rdi
-; FALLBACK10-NEXT:    shrq %rcx
-; FALLBACK10-NEXT:    shrxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT:    orq %r8, %rcx
-; FALLBACK10-NEXT:    shrq %r9
-; FALLBACK10-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %rax
 ; FALLBACK10-NEXT:    orq %r10, %rax
-; FALLBACK10-NEXT:    movq %r11, (%rdx)
+; FALLBACK10-NEXT:    movq %rcx, (%rdx)
 ; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r8, 24(%rdx)
 ; FALLBACK10-NEXT:    vzeroupper
 ; FALLBACK10-NEXT:    retq
 ;
@@ -8428,38 +8417,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; FALLBACK14-LABEL: shl_32bytes_dwordOff:
 ; FALLBACK14:       # %bb.0:
 ; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
-; FALLBACK14-NEXT:    movl %ecx, %eax
+; FALLBACK14-NEXT:    movzbl (%rsi), %esi
+; FALLBACK14-NEXT:    movl %esi, %eax
 ; FALLBACK14-NEXT:    shlb $5, %al
 ; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT:    shlb $2, %cl
-; FALLBACK14-NEXT:    andb $24, %cl
-; FALLBACK14-NEXT:    negb %cl
-; FALLBACK14-NEXT:    movsbq %cl, %rcx
-; FALLBACK14-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK14-NEXT:    movq -24(%rsp,%rcx), %rdi
-; FALLBACK14-NEXT:    shlxq %rax, %rdi, %r8
-; FALLBACK14-NEXT:    movq -40(%rsp,%rcx), %r9
-; FALLBACK14-NEXT:    movq -32(%rsp,%rcx), %rcx
-; FALLBACK14-NEXT:    shlxq %rax, %rcx, %r10
-; FALLBACK14-NEXT:    shlxq %rax, %r9, %r11
-; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    movl %eax, %ecx
+; FALLBACK14-NEXT:    shlb $2, %sil
+; FALLBACK14-NEXT:    andb $24, %sil
+; FALLBACK14-NEXT:    negb %sil
+; FALLBACK14-NEXT:    movsbq %sil, %rsi
+; FALLBACK14-NEXT:    shlxq %rcx, -16(%rsp,%rsi), %rdi
 ; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    movq -24(%rsp,%rsi), %r8
+; FALLBACK14-NEXT:    shlxq %rcx, %r8, %r9
+; FALLBACK14-NEXT:    shrq %r8
+; FALLBACK14-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK14-NEXT:    orq %rdi, %r8
+; FALLBACK14-NEXT:    movq -40(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT:    movq -32(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT:    shlxq %rcx, %rsi, %r10
+; FALLBACK14-NEXT:    shrq %rsi
+; FALLBACK14-NEXT:    shrxq %rax, %rsi, %rsi
+; FALLBACK14-NEXT:    orq %r9, %rsi
+; FALLBACK14-NEXT:    shlxq %rcx, %rdi, %rcx
 ; FALLBACK14-NEXT:    shrq %rdi
-; FALLBACK14-NEXT:    shrxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT:    orq %rsi, %rdi
-; FALLBACK14-NEXT:    shrq %rcx
-; FALLBACK14-NEXT:    shrxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT:    orq %r8, %rcx
-; FALLBACK14-NEXT:    shrq %r9
-; FALLBACK14-NEXT:    shrxq %rax, %r9, %rax
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %rax
 ; FALLBACK14-NEXT:    orq %r10, %rax
-; FALLBACK14-NEXT:    movq %r11, (%rdx)
+; FALLBACK14-NEXT:    movq %rcx, (%rdx)
 ; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT:    movq %rdi, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rsi, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r8, 24(%rdx)
 ; FALLBACK14-NEXT:    vzeroupper
 ; FALLBACK14-NEXT:    retq
 ;
@@ -8906,30 +8895,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movl %eax, %ecx
 ; FALLBACK2-NEXT:    andb $24, %sil
-; FALLBACK2-NEXT:    movzbl %sil, %ecx
-; FALLBACK2-NEXT:    movq -64(%rsp,%rcx), %rsi
-; FALLBACK2-NEXT:    movq -56(%rsp,%rcx), %rdi
-; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
-; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx), %r9
-; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK2-NEXT:    movq -48(%rsp,%rcx), %rcx
-; FALLBACK2-NEXT:    sarxq %rax, %rcx, %r11
-; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    movzbl %sil, %esi
+; FALLBACK2-NEXT:    movq -64(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT:    movq -56(%rsp,%rsi), %r8
+; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %r9
 ; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK2-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK2-NEXT:    orq %r9, %r10
+; FALLBACK2-NEXT:    shrxq %rcx, -72(%rsp,%rsi), %r9
 ; FALLBACK2-NEXT:    addq %rdi, %rdi
 ; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT:    orq %r8, %rdi
-; FALLBACK2-NEXT:    addq %rsi, %rsi
-; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r9, %rsi
-; FALLBACK2-NEXT:    addq %rcx, %rcx
-; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
-; FALLBACK2-NEXT:    orq %r10, %rax
-; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK2-NEXT:    orq %r9, %rdi
+; FALLBACK2-NEXT:    shrxq %rcx, %r8, %r8
+; FALLBACK2-NEXT:    movq -48(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT:    leaq (%rsi,%rsi), %r9
+; FALLBACK2-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK2-NEXT:    orq %r8, %rax
+; FALLBACK2-NEXT:    sarxq %rcx, %rsi, %rcx
+; FALLBACK2-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
-; FALLBACK2-NEXT:    movq %rsi, (%rdx)
-; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, (%rdx)
+; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
 ; FALLBACK2-NEXT:    retq
 ;
 ; FALLBACK3-LABEL: ashr_32bytes:
@@ -9067,30 +9056,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movl %eax, %ecx
 ; FALLBACK6-NEXT:    andb $24, %sil
-; FALLBACK6-NEXT:    movzbl %sil, %ecx
-; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK6-NEXT:    movq -64(%rsp,%rcx), %rdi
-; FALLBACK6-NEXT:    movq -56(%rsp,%rcx), %r8
-; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK6-NEXT:    movq -48(%rsp,%rcx), %rcx
-; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK6-NEXT:    sarxq %rax, %rcx, %r11
-; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    movzbl %sil, %esi
+; FALLBACK6-NEXT:    shrxq %rcx, -72(%rsp,%rsi), %rdi
 ; FALLBACK6-NEXT:    notb %al
-; FALLBACK6-NEXT:    addq %rdi, %rdi
-; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT:    orq %rsi, %rdi
-; FALLBACK6-NEXT:    addq %rcx, %rcx
-; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT:    orq %r9, %rcx
-; FALLBACK6-NEXT:    addq %r8, %r8
-; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK6-NEXT:    orq %r10, %rax
-; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq -64(%rsp,%rsi), %r8
+; FALLBACK6-NEXT:    movq -56(%rsp,%rsi), %r9
+; FALLBACK6-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK6-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK6-NEXT:    orq %rdi, %r10
+; FALLBACK6-NEXT:    shrxq %rcx, %r9, %rdi
+; FALLBACK6-NEXT:    movq -48(%rsp,%rsi), %rsi
+; FALLBACK6-NEXT:    leaq (%rsi,%rsi), %r11
+; FALLBACK6-NEXT:    shlxq %rax, %r11, %r11
+; FALLBACK6-NEXT:    orq %rdi, %r11
+; FALLBACK6-NEXT:    shrxq %rcx, %r8, %rdi
+; FALLBACK6-NEXT:    addq %r9, %r9
+; FALLBACK6-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK6-NEXT:    orq %rdi, %rax
+; FALLBACK6-NEXT:    sarxq %rcx, %rsi, %rcx
+; FALLBACK6-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT:    movq %rdi, (%rdx)
+; FALLBACK6-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r10, (%rdx)
 ; FALLBACK6-NEXT:    retq
 ;
 ; FALLBACK7-LABEL: ashr_32bytes:
@@ -9227,30 +9216,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movl %eax, %ecx
 ; FALLBACK10-NEXT:    andb $24, %sil
-; FALLBACK10-NEXT:    movzbl %sil, %ecx
-; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK10-NEXT:    movq -64(%rsp,%rcx), %rdi
-; FALLBACK10-NEXT:    movq -56(%rsp,%rcx), %r8
-; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK10-NEXT:    movq -48(%rsp,%rcx), %rcx
-; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK10-NEXT:    sarxq %rax, %rcx, %r11
-; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    movzbl %sil, %esi
+; FALLBACK10-NEXT:    shrxq %rcx, -72(%rsp,%rsi), %rdi
 ; FALLBACK10-NEXT:    notb %al
-; FALLBACK10-NEXT:    addq %rdi, %rdi
-; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT:    orq %rsi, %rdi
-; FALLBACK10-NEXT:    addq %rcx, %rcx
-; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT:    orq %r9, %rcx
-; FALLBACK10-NEXT:    addq %r8, %r8
-; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK10-NEXT:    orq %r10, %rax
-; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq -64(%rsp,%rsi), %r8
+; FALLBACK10-NEXT:    movq -56(%rsp,%rsi), %r9
+; FALLBACK10-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK10-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK10-NEXT:    orq %rdi, %r10
+; FALLBACK10-NEXT:    shrxq %rcx, %r9, %rdi
+; FALLBACK10-NEXT:    movq -48(%rsp,%rsi), %rsi
+; FALLBACK10-NEXT:    leaq (%rsi,%rsi), %r11
+; FALLBACK10-NEXT:    shlxq %rax, %r11, %r11
+; FALLBACK10-NEXT:    orq %rdi, %r11
+; FALLBACK10-NEXT:    shrxq %rcx, %r8, %rdi
+; FALLBACK10-NEXT:    addq %r9, %r9
+; FALLBACK10-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK10-NEXT:    orq %rdi, %rax
+; FALLBACK10-NEXT:    sarxq %rcx, %rsi, %rcx
+; FALLBACK10-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT:    movq %rdi, (%rdx)
+; FALLBACK10-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r10, (%rdx)
 ; FALLBACK10-NEXT:    retq
 ;
 ; FALLBACK11-LABEL: ashr_32bytes:
@@ -9387,30 +9376,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movl %eax, %ecx
 ; FALLBACK14-NEXT:    andb $24, %sil
-; FALLBACK14-NEXT:    movzbl %sil, %ecx
-; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK14-NEXT:    movq -64(%rsp,%rcx), %rdi
-; FALLBACK14-NEXT:    movq -56(%rsp,%rcx), %r8
-; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK14-NEXT:    movq -48(%rsp,%rcx), %rcx
-; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK14-NEXT:    sarxq %rax, %rcx, %r11
-; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    movzbl %sil, %esi
+; FALLBACK14-NEXT:    shrxq %rcx, -72(%rsp,%rsi), %rdi
 ; FALLBACK14-NEXT:    notb %al
-; FALLBACK14-NEXT:    addq %rdi, %rdi
-; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT:    orq %rsi, %rdi
-; FALLBACK14-NEXT:    addq %rcx, %rcx
-; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT:    orq %r9, %rcx
-; FALLBACK14-NEXT:    addq %r8, %r8
-; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK14-NEXT:    orq %r10, %rax
-; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq -64(%rsp,%rsi), %r8
+; FALLBACK14-NEXT:    movq -56(%rsp,%rsi), %r9
+; FALLBACK14-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK14-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK14-NEXT:    orq %rdi, %r10
+; FALLBACK14-NEXT:    shrxq %rcx, %r9, %rdi
+; FALLBACK14-NEXT:    movq -48(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT:    leaq (%rsi,%rsi), %r11
+; FALLBACK14-NEXT:    shlxq %rax, %r11, %r11
+; FALLBACK14-NEXT:    orq %rdi, %r11
+; FALLBACK14-NEXT:    shrxq %rcx, %r8, %rdi
+; FALLBACK14-NEXT:    addq %r9, %r9
+; FALLBACK14-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK14-NEXT:    orq %rdi, %rax
+; FALLBACK14-NEXT:    sarxq %rcx, %rsi, %rcx
+; FALLBACK14-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT:    movq %rdi, (%rdx)
+; FALLBACK14-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r10, (%rdx)
 ; FALLBACK14-NEXT:    retq
 ;
 ; FALLBACK15-LABEL: ashr_32bytes:
@@ -9671,7 +9660,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    pushl %edi
 ; FALLBACK18-NEXT:    pushl %esi
 ; FALLBACK18-NEXT:    subl $108, %esp
-; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; FALLBACK18-NEXT:    movl (%esi), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -9680,22 +9669,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl 8(%esi), %ebx
 ; FALLBACK18-NEXT:    movl 12(%esi), %ebp
 ; FALLBACK18-NEXT:    movl 16(%esi), %edi
-; FALLBACK18-NEXT:    movzbl (%ecx), %ecx
-; FALLBACK18-NEXT:    movl 20(%esi), %edx
+; FALLBACK18-NEXT:    movzbl (%edx), %edx
+; FALLBACK18-NEXT:    movl 20(%esi), %ecx
 ; FALLBACK18-NEXT:    movl 24(%esi), %eax
 ; FALLBACK18-NEXT:    movl 28(%esi), %esi
 ; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %ecx, %eax
-; FALLBACK18-NEXT:    shlb $3, %al
+; FALLBACK18-NEXT:    movl %edx, %ecx
+; FALLBACK18-NEXT:    shlb $3, %cl
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    sarl $31, %esi
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
@@ -9705,66 +9694,65 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    andb $28, %cl
-; FALLBACK18-NEXT:    movzbl %cl, %edi
-; FALLBACK18-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK18-NEXT:    movl 40(%esp,%edi), %ecx
-; FALLBACK18-NEXT:    shrxl %eax, %esi, %ebx
-; FALLBACK18-NEXT:    movl %eax, %edx
-; FALLBACK18-NEXT:    notb %dl
-; FALLBACK18-NEXT:    leal (%ecx,%ecx), %ebp
-; FALLBACK18-NEXT:    shlxl %edx, %ebp, %ebp
-; FALLBACK18-NEXT:    orl %ebx, %ebp
-; FALLBACK18-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %eax, 32(%esp,%edi), %ebx
-; FALLBACK18-NEXT:    addl %esi, %esi
-; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK18-NEXT:    orl %ebx, %esi
-; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 48(%esp,%edi), %esi
-; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK18-NEXT:    shlxl %edx, %ebx, %esi
-; FALLBACK18-NEXT:    movl 44(%esp,%edi), %ebp
-; FALLBACK18-NEXT:    shrxl %eax, %ebp, %ebx
-; FALLBACK18-NEXT:    orl %ebx, %esi
-; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %eax, %ecx, %ecx
-; FALLBACK18-NEXT:    movl %eax, %ebx
-; FALLBACK18-NEXT:    addl %ebp, %ebp
-; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
-; FALLBACK18-NEXT:    orl %ecx, %eax
+; FALLBACK18-NEXT:    movl %ecx, %eax
+; FALLBACK18-NEXT:    andb $28, %dl
+; FALLBACK18-NEXT:    movzbl %dl, %esi
+; FALLBACK18-NEXT:    movl 36(%esp,%esi), %edx
+; FALLBACK18-NEXT:    movl 40(%esp,%esi), %ebp
+; FALLBACK18-NEXT:    shrxl %eax, %edx, %edi
+; FALLBACK18-NEXT:    notb %cl
+; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ebx
+; FALLBACK18-NEXT:    shlxl %ecx, %ebx, %ebx
+; FALLBACK18-NEXT:    orl %edi, %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %eax, 32(%esp,%esi), %edi
+; FALLBACK18-NEXT:    addl %edx, %edx
+; FALLBACK18-NEXT:    shlxl %ecx, %edx, %edx
+; FALLBACK18-NEXT:    orl %edi, %edx
+; FALLBACK18-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 48(%esp,%esi), %edx
+; FALLBACK18-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %edx, %edx
+; FALLBACK18-NEXT:    shlxl %ecx, %edx, %ebx
+; FALLBACK18-NEXT:    movl 44(%esp,%esi), %edx
+; FALLBACK18-NEXT:    shrxl %eax, %edx, %edi
+; FALLBACK18-NEXT:    orl %edi, %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %eax, %ebp, %edi
+; FALLBACK18-NEXT:    movl %eax, %ebp
+; FALLBACK18-NEXT:    addl %edx, %edx
+; FALLBACK18-NEXT:    shlxl %ecx, %edx, %eax
+; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 56(%esp,%edi), %ebp
-; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
-; FALLBACK18-NEXT:    movl 52(%esp,%edi), %eax
-; FALLBACK18-NEXT:    shrxl %ebx, %eax, %esi
-; FALLBACK18-NEXT:    orl %esi, %ecx
-; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 56(%esp,%esi), %edi
+; FALLBACK18-NEXT:    leal (%edi,%edi), %edx
+; FALLBACK18-NEXT:    shlxl %ecx, %edx, %edx
+; FALLBACK18-NEXT:    movl 52(%esp,%esi), %eax
+; FALLBACK18-NEXT:    shrxl %ebp, %eax, %ebx
+; FALLBACK18-NEXT:    orl %ebx, %edx
+; FALLBACK18-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %eax, %eax
-; FALLBACK18-NEXT:    shlxl %edx, %eax, %esi
-; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK18-NEXT:    shrxl %ebx, %ebp, %eax
-; FALLBACK18-NEXT:    movl 60(%esp,%edi), %edi
-; FALLBACK18-NEXT:    sarxl %ebx, %edi, %ebx
-; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %edx, %edi, %edx
-; FALLBACK18-NEXT:    orl %eax, %edx
-; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT:    movl %ebx, 28(%eax)
-; FALLBACK18-NEXT:    movl %edx, 24(%eax)
-; FALLBACK18-NEXT:    movl %esi, 16(%eax)
-; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, (%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    shlxl %ecx, %eax, %eax
+; FALLBACK18-NEXT:    orl %ebx, %eax
+; FALLBACK18-NEXT:    movl 60(%esp,%esi), %esi
+; FALLBACK18-NEXT:    leal (%esi,%esi), %ebx
+; FALLBACK18-NEXT:    shlxl %ecx, %ebx, %ecx
+; FALLBACK18-NEXT:    shrxl %ebp, %edi, %edi
+; FALLBACK18-NEXT:    orl %edi, %ecx
+; FALLBACK18-NEXT:    sarxl %ebp, %esi, %esi
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; FALLBACK18-NEXT:    movl %esi, 28(%edi)
+; FALLBACK18-NEXT:    movl %ecx, 24(%edi)
+; FALLBACK18-NEXT:    movl %eax, 16(%edi)
+; FALLBACK18-NEXT:    movl %edx, 20(%edi)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 8(%edi)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 12(%edi)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, (%edi)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 4(%edi)
 ; FALLBACK18-NEXT:    addl $108, %esp
 ; FALLBACK18-NEXT:    popl %esi
 ; FALLBACK18-NEXT:    popl %edi
@@ -10070,82 +10058,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movups (%ecx), %xmm0
 ; FALLBACK22-NEXT:    movl 16(%ecx), %esi
 ; FALLBACK22-NEXT:    movl 20(%ecx), %edi
-; FALLBACK22-NEXT:    movl 24(%ecx), %ebx
-; FALLBACK22-NEXT:    movl 28(%ecx), %edx
-; FALLBACK22-NEXT:    movzbl (%eax), %ecx
-; FALLBACK22-NEXT:    movl %ecx, %eax
-; FALLBACK22-NEXT:    shlb $3, %al
-; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl 24(%ecx), %ebp
+; FALLBACK22-NEXT:    movl 28(%ecx), %ecx
+; FALLBACK22-NEXT:    movzbl (%eax), %edx
+; FALLBACK22-NEXT:    movl %edx, %ebx
+; FALLBACK22-NEXT:    shlb $3, %bl
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    sarl $31, %edx
-; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    andb $28, %cl
-; FALLBACK22-NEXT:    movzbl %cl, %edi
-; FALLBACK22-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
-; FALLBACK22-NEXT:    movl %eax, %edx
-; FALLBACK22-NEXT:    notb %dl
-; FALLBACK22-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK22-NEXT:    orl %ecx, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 48(%esp,%edi), %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    addl %ecx, %ecx
-; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
-; FALLBACK22-NEXT:    movl 44(%esp,%edi), %ecx
-; FALLBACK22-NEXT:    shrxl %eax, %ecx, %ebx
-; FALLBACK22-NEXT:    orl %ebx, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    addl %ecx, %ecx
-; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
-; FALLBACK22-NEXT:    movl 40(%esp,%edi), %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %eax, %ecx, %ebx
-; FALLBACK22-NEXT:    movl %eax, %ecx
-; FALLBACK22-NEXT:    orl %ebx, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 56(%esp,%edi), %esi
-; FALLBACK22-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK22-NEXT:    shlxl %edx, %ebx, %eax
-; FALLBACK22-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK22-NEXT:    shrxl %ecx, %ebx, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    sarl $31, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT:    movl %ebx, %eax
+; FALLBACK22-NEXT:    andb $28, %dl
+; FALLBACK22-NEXT:    movzbl %dl, %ecx
+; FALLBACK22-NEXT:    shrxl %eax, 32(%esp,%ecx), %edx
+; FALLBACK22-NEXT:    movl %eax, %ebp
+; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    movl 36(%esp,%ecx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl %ecx, %eax
-; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK22-NEXT:    addl %ebx, %ebx
-; FALLBACK22-NEXT:    shlxl %edx, %ebx, %ebx
-; FALLBACK22-NEXT:    orl %ebp, %ebx
-; FALLBACK22-NEXT:    shrxl %ecx, %esi, %ecx
-; FALLBACK22-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK22-NEXT:    movl 60(%esp,%edi), %edi
-; FALLBACK22-NEXT:    sarxl %eax, %edi, %eax
-; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
-; FALLBACK22-NEXT:    orl %ecx, %edi
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    addl %ecx, %ecx
-; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ecx
-; FALLBACK22-NEXT:    orl %esi, %ecx
-; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK22-NEXT:    movl %eax, 28(%edx)
-; FALLBACK22-NEXT:    movl %ecx, 4(%edx)
-; FALLBACK22-NEXT:    movl %edi, 24(%edx)
-; FALLBACK22-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK22-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    orl %edx, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 48(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    leal (%eax,%eax), %edx
+; FALLBACK22-NEXT:    shlxl %ebx, %edx, %edi
+; FALLBACK22-NEXT:    movl 44(%esp,%ecx), %edx
+; FALLBACK22-NEXT:    shrxl %ebp, %edx, %esi
+; FALLBACK22-NEXT:    orl %esi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %edx, %edx
+; FALLBACK22-NEXT:    shlxl %ebx, %edx, %edi
+; FALLBACK22-NEXT:    movl 40(%esp,%ecx), %edx
+; FALLBACK22-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %ebp, %edx, %esi
+; FALLBACK22-NEXT:    movl %ebp, %edx
+; FALLBACK22-NEXT:    orl %esi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 56(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK22-NEXT:    movl 52(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT:    orl %edi, %ebp
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %eax, %eax
+; FALLBACK22-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    movl 60(%esp,%ecx), %ecx
+; FALLBACK22-NEXT:    leal (%ecx,%ecx), %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT:    orl %eax, %esi
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT:    movl %eax, 20(%edx)
+; FALLBACK22-NEXT:    addl %eax, %eax
+; FALLBACK22-NEXT:    shlxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    movl %edx, %ebx
+; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    orl %edx, %eax
+; FALLBACK22-NEXT:    sarxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT:    movl %ecx, 28(%edx)
+; FALLBACK22-NEXT:    movl %eax, 4(%edx)
+; FALLBACK22-NEXT:    movl %esi, 24(%edx)
+; FALLBACK22-NEXT:    movl %edi, 16(%edx)
+; FALLBACK22-NEXT:    movl %ebp, 20(%edx)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK22-NEXT:    movl %eax, 8(%edx)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -10446,82 +10434,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
 ; FALLBACK26-NEXT:    movl 16(%ecx), %esi
 ; FALLBACK26-NEXT:    movl 20(%ecx), %edi
-; FALLBACK26-NEXT:    movl 24(%ecx), %ebx
-; FALLBACK26-NEXT:    movl 28(%ecx), %edx
-; FALLBACK26-NEXT:    movzbl (%eax), %ecx
-; FALLBACK26-NEXT:    movl %ecx, %eax
-; FALLBACK26-NEXT:    shlb $3, %al
-; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl 24(%ecx), %ebp
+; FALLBACK26-NEXT:    movl 28(%ecx), %ecx
+; FALLBACK26-NEXT:    movzbl (%eax), %edx
+; FALLBACK26-NEXT:    movl %edx, %ebx
+; FALLBACK26-NEXT:    shlb $3, %bl
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    sarl $31, %edx
-; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    andb $28, %cl
-; FALLBACK26-NEXT:    movzbl %cl, %edi
-; FALLBACK26-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
-; FALLBACK26-NEXT:    movl %eax, %edx
-; FALLBACK26-NEXT:    notb %dl
-; FALLBACK26-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    addl %esi, %esi
-; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK26-NEXT:    orl %ecx, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 48(%esp,%edi), %ecx
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    addl %ecx, %ecx
-; FALLBACK26-NEXT:    shlxl %edx, %ecx, %esi
-; FALLBACK26-NEXT:    movl 44(%esp,%edi), %ecx
-; FALLBACK26-NEXT:    shrxl %eax, %ecx, %ebx
-; FALLBACK26-NEXT:    orl %ebx, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    addl %ecx, %ecx
-; FALLBACK26-NEXT:    shlxl %edx, %ecx, %esi
-; FALLBACK26-NEXT:    movl 40(%esp,%edi), %ecx
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %eax, %ecx, %ebx
-; FALLBACK26-NEXT:    movl %eax, %ecx
-; FALLBACK26-NEXT:    orl %ebx, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 56(%esp,%edi), %esi
-; FALLBACK26-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK26-NEXT:    shlxl %edx, %ebx, %eax
-; FALLBACK26-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK26-NEXT:    shrxl %ecx, %ebx, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    sarl $31, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT:    movl %ebx, %eax
+; FALLBACK26-NEXT:    andb $28, %dl
+; FALLBACK26-NEXT:    movzbl %dl, %ecx
+; FALLBACK26-NEXT:    shrxl %eax, 32(%esp,%ecx), %edx
+; FALLBACK26-NEXT:    movl %eax, %ebp
+; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    movl 36(%esp,%ecx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl %ecx, %eax
-; FALLBACK26-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK26-NEXT:    addl %ebx, %ebx
-; FALLBACK26-NEXT:    shlxl %edx, %ebx, %ebx
-; FALLBACK26-NEXT:    orl %ebp, %ebx
-; FALLBACK26-NEXT:    shrxl %ecx, %esi, %ecx
-; FALLBACK26-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK26-NEXT:    movl 60(%esp,%edi), %edi
-; FALLBACK26-NEXT:    sarxl %eax, %edi, %eax
-; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
-; FALLBACK26-NEXT:    orl %ecx, %edi
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    addl %ecx, %ecx
-; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ecx
-; FALLBACK26-NEXT:    orl %esi, %ecx
+; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    orl %edx, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 48(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    leal (%eax,%eax), %edx
+; FALLBACK26-NEXT:    shlxl %ebx, %edx, %edi
+; FALLBACK26-NEXT:    movl 44(%esp,%ecx), %edx
+; FALLBACK26-NEXT:    shrxl %ebp, %edx, %esi
+; FALLBACK26-NEXT:    orl %esi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %edx, %edx
+; FALLBACK26-NEXT:    shlxl %ebx, %edx, %edi
+; FALLBACK26-NEXT:    movl 40(%esp,%ecx), %edx
+; FALLBACK26-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %ebp, %edx, %esi
+; FALLBACK26-NEXT:    movl %ebp, %edx
+; FALLBACK26-NEXT:    orl %esi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 56(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT:    movl 52(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT:    orl %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %eax, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK26-NEXT:    movl 60(%esp,%ecx), %ecx
+; FALLBACK26-NEXT:    leal (%ecx,%ecx), %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    orl %eax, %esi
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    addl %eax, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    movl %edx, %ebx
+; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; FALLBACK26-NEXT:    orl %edx, %eax
+; FALLBACK26-NEXT:    sarxl %ebx, %ecx, %ecx
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK26-NEXT:    movl %eax, 28(%edx)
-; FALLBACK26-NEXT:    movl %ecx, 4(%edx)
-; FALLBACK26-NEXT:    movl %edi, 24(%edx)
-; FALLBACK26-NEXT:    movl %ebx, 16(%edx)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT:    movl %eax, 20(%edx)
+; FALLBACK26-NEXT:    movl %ecx, 28(%edx)
+; FALLBACK26-NEXT:    movl %eax, 4(%edx)
+; FALLBACK26-NEXT:    movl %esi, 24(%edx)
+; FALLBACK26-NEXT:    movl %edi, 16(%edx)
+; FALLBACK26-NEXT:    movl %ebp, 20(%edx)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK26-NEXT:    movl %eax, 8(%edx)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -10822,82 +10810,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
 ; FALLBACK30-NEXT:    movl 16(%ecx), %esi
 ; FALLBACK30-NEXT:    movl 20(%ecx), %edi
-; FALLBACK30-NEXT:    movl 24(%ecx), %ebx
-; FALLBACK30-NEXT:    movl 28(%ecx), %edx
-; FALLBACK30-NEXT:    movzbl (%eax), %ecx
-; FALLBACK30-NEXT:    movl %ecx, %eax
-; FALLBACK30-NEXT:    shlb $3, %al
-; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl 24(%ecx), %ebp
+; FALLBACK30-NEXT:    movl 28(%ecx), %ecx
+; FALLBACK30-NEXT:    movzbl (%eax), %edx
+; FALLBACK30-NEXT:    movl %edx, %ebx
+; FALLBACK30-NEXT:    shlb $3, %bl
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    sarl $31, %edx
-; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    andb $28, %cl
-; FALLBACK30-NEXT:    movzbl %cl, %edi
-; FALLBACK30-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
-; FALLBACK30-NEXT:    movl %eax, %edx
-; FALLBACK30-NEXT:    notb %dl
-; FALLBACK30-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    addl %esi, %esi
-; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK30-NEXT:    orl %ecx, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 48(%esp,%edi), %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    addl %ecx, %ecx
-; FALLBACK30-NEXT:    shlxl %edx, %ecx, %esi
-; FALLBACK30-NEXT:    movl 44(%esp,%edi), %ecx
-; FALLBACK30-NEXT:    shrxl %eax, %ecx, %ebx
-; FALLBACK30-NEXT:    orl %ebx, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    addl %ecx, %ecx
-; FALLBACK30-NEXT:    shlxl %edx, %ecx, %esi
-; FALLBACK30-NEXT:    movl 40(%esp,%edi), %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %eax, %ecx, %ebx
-; FALLBACK30-NEXT:    movl %eax, %ecx
-; FALLBACK30-NEXT:    orl %ebx, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 56(%esp,%edi), %esi
-; FALLBACK30-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK30-NEXT:    shlxl %edx, %ebx, %eax
-; FALLBACK30-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK30-NEXT:    shrxl %ecx, %ebx, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    sarl $31, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT:    movl %ebx, %eax
+; FALLBACK30-NEXT:    andb $28, %dl
+; FALLBACK30-NEXT:    movzbl %dl, %ecx
+; FALLBACK30-NEXT:    shrxl %eax, 32(%esp,%ecx), %edx
+; FALLBACK30-NEXT:    movl %eax, %ebp
+; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    movl 36(%esp,%ecx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl %ecx, %eax
-; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK30-NEXT:    addl %ebx, %ebx
-; FALLBACK30-NEXT:    shlxl %edx, %ebx, %ebx
-; FALLBACK30-NEXT:    orl %ebp, %ebx
-; FALLBACK30-NEXT:    shrxl %ecx, %esi, %ecx
-; FALLBACK30-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK30-NEXT:    movl 60(%esp,%edi), %edi
-; FALLBACK30-NEXT:    sarxl %eax, %edi, %eax
-; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %edx, %edi, %edi
-; FALLBACK30-NEXT:    orl %ecx, %edi
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    addl %ecx, %ecx
-; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ecx
-; FALLBACK30-NEXT:    orl %esi, %ecx
-; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK30-NEXT:    movl %eax, 28(%edx)
-; FALLBACK30-NEXT:    movl %ecx, 4(%edx)
-; FALLBACK30-NEXT:    movl %edi, 24(%edx)
-; FALLBACK30-NEXT:    movl %ebx, 16(%edx)
+; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    orl %edx, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 48(%esp,%ecx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    leal (%eax,%eax), %edx
+; FALLBACK30-NEXT:    shlxl %ebx, %edx, %edi
+; FALLBACK30-NEXT:    movl 44(%esp,%ecx), %edx
+; FALLBACK30-NEXT:    shrxl %ebp, %edx, %esi
+; FALLBACK30-NEXT:    orl %esi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %edx, %edx
+; FALLBACK30-NEXT:    shlxl %ebx, %edx, %edi
+; FALLBACK30-NEXT:    movl 40(%esp,%ecx), %edx
+; FALLBACK30-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ebp, %edx, %esi
+; FALLBACK30-NEXT:    movl %ebp, %edx
+; FALLBACK30-NEXT:    orl %esi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 56(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT:    movl 52(%esp,%ecx), %eax
+; FALLBACK30-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK30-NEXT:    orl %edi, %ebp
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %eax, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %eax, %edi
+; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK30-NEXT:    movl 60(%esp,%ecx), %ecx
+; FALLBACK30-NEXT:    leal (%ecx,%ecx), %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    orl %eax, %esi
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT:    movl %eax, 20(%edx)
+; FALLBACK30-NEXT:    addl %eax, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    movl %edx, %ebx
+; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; FALLBACK30-NEXT:    orl %edx, %eax
+; FALLBACK30-NEXT:    sarxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT:    movl %ecx, 28(%edx)
+; FALLBACK30-NEXT:    movl %eax, 4(%edx)
+; FALLBACK30-NEXT:    movl %esi, 24(%edx)
+; FALLBACK30-NEXT:    movl %edi, 16(%edx)
+; FALLBACK30-NEXT:    movl %ebp, 20(%edx)
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK30-NEXT:    movl %eax, 8(%edx)
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -11104,30 +11092,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT:    movl %eax, %ecx
 ; FALLBACK2-NEXT:    andb $6, %sil
-; FALLBACK2-NEXT:    movzbl %sil, %ecx
-; FALLBACK2-NEXT:    movq -64(%rsp,%rcx,4), %rsi
-; FALLBACK2-NEXT:    movq -56(%rsp,%rcx,4), %rdi
-; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
-; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %r9
-; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK2-NEXT:    movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK2-NEXT:    sarxq %rax, %rcx, %r11
-; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT:    movzbl %sil, %esi
+; FALLBACK2-NEXT:    movq -64(%rsp,%rsi,4), %rdi
+; FALLBACK2-NEXT:    movq -56(%rsp,%rsi,4), %r8
+; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %r9
 ; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK2-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK2-NEXT:    orq %r9, %r10
+; FALLBACK2-NEXT:    shrxq %rcx, -72(%rsp,%rsi,4), %r9
 ; FALLBACK2-NEXT:    addq %rdi, %rdi
 ; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT:    orq %r8, %rdi
-; FALLBACK2-NEXT:    addq %rsi, %rsi
-; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r9, %rsi
-; FALLBACK2-NEXT:    addq %rcx, %rcx
-; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
-; FALLBACK2-NEXT:    orq %r10, %rax
-; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK2-NEXT:    orq %r9, %rdi
+; FALLBACK2-NEXT:    shrxq %rcx, %r8, %r8
+; FALLBACK2-NEXT:    movq -48(%rsp,%rsi,4), %rsi
+; FALLBACK2-NEXT:    leaq (%rsi,%rsi), %r9
+; FALLBACK2-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK2-NEXT:    orq %r8, %rax
+; FALLBACK2-NEXT:    sarxq %rcx, %rsi, %rcx
+; FALLBACK2-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
-; FALLBACK2-NEXT:    movq %rsi, (%rdx)
-; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, (%rdx)
+; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
 ; FALLBACK2-NEXT:    retq
 ;
 ; FALLBACK3-LABEL: ashr_32bytes_dwordOff:
@@ -11268,30 +11256,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT:    movl %eax, %ecx
 ; FALLBACK6-NEXT:    andb $6, %sil
-; FALLBACK6-NEXT:    movzbl %sil, %ecx
-; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK6-NEXT:    movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK6-NEXT:    movq -56(%rsp,%rcx,4), %r8
-; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK6-NEXT:    movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK6-NEXT:    sarxq %rax, %rcx, %r11
-; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT:    movzbl %sil, %esi
+; FALLBACK6-NEXT:    shrxq %rcx, -72(%rsp,%rsi,4), %rdi
 ; FALLBACK6-NEXT:    notb %al
-; FALLBACK6-NEXT:    addq %rdi, %rdi
-; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT:    orq %rsi, %rdi
-; FALLBACK6-NEXT:    addq %rcx, %rcx
-; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT:    orq %r9, %rcx
-; FALLBACK6-NEXT:    addq %r8, %r8
-; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK6-NEXT:    orq %r10, %rax
-; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq -64(%rsp,%rsi,4), %r8
+; FALLBACK6-NEXT:    movq -56(%rsp,%rsi,4), %r9
+; FALLBACK6-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK6-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK6-NEXT:    orq %rdi, %r10
+; FALLBACK6-NEXT:    shrxq %rcx, %r9, %rdi
+; FALLBACK6-NEXT:    movq -48(%rsp,%rsi,4), %rsi
+; FALLBACK6-NEXT:    leaq (%rsi,%rsi), %r11
+; FALLBACK6-NEXT:    shlxq %rax, %r11, %r11
+; FALLBACK6-NEXT:    orq %rdi, %r11
+; FALLBACK6-NEXT:    shrxq %rcx, %r8, %rdi
+; FALLBACK6-NEXT:    addq %r9, %r9
+; FALLBACK6-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK6-NEXT:    orq %rdi, %rax
+; FALLBACK6-NEXT:    sarxq %rcx, %rsi, %rcx
+; FALLBACK6-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT:    movq %rdi, (%rdx)
+; FALLBACK6-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r10, (%rdx)
 ; FALLBACK6-NEXT:    retq
 ;
 ; FALLBACK7-LABEL: ashr_32bytes_dwordOff:
@@ -11431,30 +11419,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT:    movl %eax, %ecx
 ; FALLBACK10-NEXT:    andb $6, %sil
-; FALLBACK10-NEXT:    movzbl %sil, %ecx
-; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK10-NEXT:    movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK10-NEXT:    movq -56(%rsp,%rcx,4), %r8
-; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK10-NEXT:    movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK10-NEXT:    sarxq %rax, %rcx, %r11
-; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT:    movzbl %sil, %esi
+; FALLBACK10-NEXT:    shrxq %rcx, -72(%rsp,%rsi,4), %rdi
 ; FALLBACK10-NEXT:    notb %al
-; FALLBACK10-NEXT:    addq %rdi, %rdi
-; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT:    orq %rsi, %rdi
-; FALLBACK10-NEXT:    addq %rcx, %rcx
-; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT:    orq %r9, %rcx
-; FALLBACK10-NEXT:    addq %r8, %r8
-; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK10-NEXT:    orq %r10, %rax
-; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq -64(%rsp,%rsi,4), %r8
+; FALLBACK10-NEXT:    movq -56(%rsp,%rsi,4), %r9
+; FALLBACK10-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK10-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK10-NEXT:    orq %rdi, %r10
+; FALLBACK10-NEXT:    shrxq %rcx, %r9, %rdi
+; FALLBACK10-NEXT:    movq -48(%rsp,%rsi,4), %rsi
+; FALLBACK10-NEXT:    leaq (%rsi,%rsi), %r11
+; FALLBACK10-NEXT:    shlxq %rax, %r11, %r11
+; FALLBACK10-NEXT:    orq %rdi, %r11
+; FALLBACK10-NEXT:    shrxq %rcx, %r8, %rdi
+; FALLBACK10-NEXT:    addq %r9, %r9
+; FALLBACK10-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK10-NEXT:    orq %rdi, %rax
+; FALLBACK10-NEXT:    sarxq %rcx, %rsi, %rcx
+; FALLBACK10-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT:    movq %rdi, (%rdx)
+; FALLBACK10-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r10, (%rdx)
 ; FALLBACK10-NEXT:    retq
 ;
 ; FALLBACK11-LABEL: ashr_32bytes_dwordOff:
@@ -11594,30 +11582,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT:    movl %eax, %ecx
 ; FALLBACK14-NEXT:    andb $6, %sil
-; FALLBACK14-NEXT:    movzbl %sil, %ecx
-; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK14-NEXT:    movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK14-NEXT:    movq -56(%rsp,%rcx,4), %r8
-; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
-; FALLBACK14-NEXT:    movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
-; FALLBACK14-NEXT:    sarxq %rax, %rcx, %r11
-; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT:    movzbl %sil, %esi
+; FALLBACK14-NEXT:    shrxq %rcx, -72(%rsp,%rsi,4), %rdi
 ; FALLBACK14-NEXT:    notb %al
-; FALLBACK14-NEXT:    addq %rdi, %rdi
-; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT:    orq %rsi, %rdi
-; FALLBACK14-NEXT:    addq %rcx, %rcx
-; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT:    orq %r9, %rcx
-; FALLBACK14-NEXT:    addq %r8, %r8
-; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
-; FALLBACK14-NEXT:    orq %r10, %rax
-; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq -64(%rsp,%rsi,4), %r8
+; FALLBACK14-NEXT:    movq -56(%rsp,%rsi,4), %r9
+; FALLBACK14-NEXT:    leaq (%r8,%r8), %r10
+; FALLBACK14-NEXT:    shlxq %rax, %r10, %r10
+; FALLBACK14-NEXT:    orq %rdi, %r10
+; FALLBACK14-NEXT:    shrxq %rcx, %r9, %rdi
+; FALLBACK14-NEXT:    movq -48(%rsp,%rsi,4), %rsi
+; FALLBACK14-NEXT:    leaq (%rsi,%rsi), %r11
+; FALLBACK14-NEXT:    shlxq %rax, %r11, %r11
+; FALLBACK14-NEXT:    orq %rdi, %r11
+; FALLBACK14-NEXT:    shrxq %rcx, %r8, %rdi
+; FALLBACK14-NEXT:    addq %r9, %r9
+; FALLBACK14-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK14-NEXT:    orq %rdi, %rax
+; FALLBACK14-NEXT:    sarxq %rcx, %rsi, %rcx
+; FALLBACK14-NEXT:    movq %rcx, 24(%rdx)
 ; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT:    movq %rdi, (%rdx)
+; FALLBACK14-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r10, (%rdx)
 ; FALLBACK14-NEXT:    retq
 ;
 ; FALLBACK15-LABEL: ashr_32bytes_dwordOff:
@@ -12204,10 +12192,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK2-LABEL: lshr_64bytes:
 ; FALLBACK2:       # %bb.0:
-; FALLBACK2-NEXT:    pushq %rbp
 ; FALLBACK2-NEXT:    pushq %r15
 ; FALLBACK2-NEXT:    pushq %r14
-; FALLBACK2-NEXT:    pushq %r13
 ; FALLBACK2-NEXT:    pushq %r12
 ; FALLBACK2-NEXT:    pushq %rbx
 ; FALLBACK2-NEXT:    pushq %rax
@@ -12235,60 +12221,58 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    leal (,%rax,8), %ecx
 ; FALLBACK2-NEXT:    andl $56, %ecx
+; FALLBACK2-NEXT:    movl %ecx, %esi
 ; FALLBACK2-NEXT:    andl $56, %eax
-; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %rdi
-; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r9
-; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %rbx
-; FALLBACK2-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
-; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %rsi
-; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %r8
-; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r10
-; FALLBACK2-NEXT:    shrxq %rcx, %r9, %r11
-; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r14
-; FALLBACK2-NEXT:    shrxq %rcx, %r14, %r15
-; FALLBACK2-NEXT:    shrxq %rcx, %r10, %rbp
-; FALLBACK2-NEXT:    movl %ecx, %r12d
-; FALLBACK2-NEXT:    notb %r12b
-; FALLBACK2-NEXT:    addq %r9, %r9
-; FALLBACK2-NEXT:    shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %r8
+; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    shrxq %rsi, %r8, %r9
+; FALLBACK2-NEXT:    notb %cl
+; FALLBACK2-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK2-NEXT:    shlxq %rcx, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r9, %rdi
+; FALLBACK2-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r9
+; FALLBACK2-NEXT:    addq %r8, %r8
+; FALLBACK2-NEXT:    shlxq %rcx, %r8, %r8
+; FALLBACK2-NEXT:    orq %r9, %r8
+; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK2-NEXT:    shrxq %rsi, %r11, %rbx
+; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r14
+; FALLBACK2-NEXT:    leaq (%r14,%r14), %r9
+; FALLBACK2-NEXT:    shlxq %rcx, %r9, %r9
 ; FALLBACK2-NEXT:    orq %rbx, %r9
-; FALLBACK2-NEXT:    addq %rdi, %rdi
-; FALLBACK2-NEXT:    shlxq %r12, %rdi, %rdi
-; FALLBACK2-NEXT:    orq %r13, %rdi
-; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %rbx
-; FALLBACK2-NEXT:    shrxq %rcx, %rbx, %r13
-; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
-; FALLBACK2-NEXT:    shrxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT:    shrxq %rsi, %r10, %r10
+; FALLBACK2-NEXT:    addq %r11, %r11
+; FALLBACK2-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK2-NEXT:    orq %r10, %r11
+; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    shrxq %rsi, %r10, %rbx
+; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %r15
+; FALLBACK2-NEXT:    leaq (%r15,%r15), %r12
+; FALLBACK2-NEXT:    shlxq %rcx, %r12, %r12
+; FALLBACK2-NEXT:    orq %rbx, %r12
+; FALLBACK2-NEXT:    shrxq %rsi, %r14, %rbx
 ; FALLBACK2-NEXT:    addq %r10, %r10
-; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
-; FALLBACK2-NEXT:    orq %r8, %r10
-; FALLBACK2-NEXT:    addq %rsi, %rsi
-; FALLBACK2-NEXT:    shlxq %r12, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r11, %rsi
-; FALLBACK2-NEXT:    leaq (%rbx,%rbx), %r8
-; FALLBACK2-NEXT:    shlxq %r12, %r8, %r8
-; FALLBACK2-NEXT:    orq %r15, %r8
-; FALLBACK2-NEXT:    addq %r14, %r14
-; FALLBACK2-NEXT:    shlxq %r12, %r14, %r11
-; FALLBACK2-NEXT:    orq %rbp, %r11
-; FALLBACK2-NEXT:    addq %rax, %rax
-; FALLBACK2-NEXT:    shlxq %r12, %rax, %rax
-; FALLBACK2-NEXT:    orq %r13, %rax
-; FALLBACK2-NEXT:    movq %rcx, 56(%rdx)
-; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
-; FALLBACK2-NEXT:    movq %r8, 40(%rdx)
-; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
-; FALLBACK2-NEXT:    movq %r10, 24(%rdx)
-; FALLBACK2-NEXT:    movq %rdi, (%rdx)
-; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK2-NEXT:    shlxq %rcx, %r10, %r10
+; FALLBACK2-NEXT:    orq %rbx, %r10
+; FALLBACK2-NEXT:    shrxq %rsi, %r15, %rbx
+; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT:    leaq (%rax,%rax), %r14
+; FALLBACK2-NEXT:    shlxq %rcx, %r14, %rcx
+; FALLBACK2-NEXT:    orq %rbx, %rcx
+; FALLBACK2-NEXT:    shrxq %rsi, %rax, %rax
+; FALLBACK2-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK2-NEXT:    movq %rcx, 48(%rdx)
+; FALLBACK2-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r12, 40(%rdx)
+; FALLBACK2-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK2-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r8, (%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
 ; FALLBACK2-NEXT:    addq $8, %rsp
 ; FALLBACK2-NEXT:    popq %rbx
 ; FALLBACK2-NEXT:    popq %r12
-; FALLBACK2-NEXT:    popq %r13
 ; FALLBACK2-NEXT:    popq %r14
 ; FALLBACK2-NEXT:    popq %r15
-; FALLBACK2-NEXT:    popq %rbp
 ; FALLBACK2-NEXT:    retq
 ;
 ; FALLBACK3-LABEL: lshr_64bytes:
@@ -12512,13 +12496,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK6-LABEL: lshr_64bytes:
 ; FALLBACK6:       # %bb.0:
-; FALLBACK6-NEXT:    pushq %rbp
 ; FALLBACK6-NEXT:    pushq %r15
 ; FALLBACK6-NEXT:    pushq %r14
 ; FALLBACK6-NEXT:    pushq %r13
 ; FALLBACK6-NEXT:    pushq %r12
 ; FALLBACK6-NEXT:    pushq %rbx
-; FALLBACK6-NEXT:    pushq %rax
 ; FALLBACK6-NEXT:    movups (%rdi), %xmm0
 ; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
 ; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
@@ -12533,62 +12515,60 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT:    leal (,%rax,8), %esi
-; FALLBACK6-NEXT:    andl $56, %esi
+; FALLBACK6-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK6-NEXT:    andl $56, %ecx
+; FALLBACK6-NEXT:    movl %ecx, %esi
 ; FALLBACK6-NEXT:    andl $56, %eax
-; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %rcx
-; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %rdi
-; FALLBACK6-NEXT:    shrxq %rsi, %rdi, %r12
-; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r13
-; FALLBACK6-NEXT:    shrxq %rsi, %rcx, %r9
-; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %r10
-; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r14
-; FALLBACK6-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK6-NEXT:    movl %esi, %ebx
-; FALLBACK6-NEXT:    notb %bl
-; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %rbp
-; FALLBACK6-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK6-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK6-NEXT:    orq %r11, %r8
-; FALLBACK6-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK6-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK6-NEXT:    orq %r12, %r11
+; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
+; FALLBACK6-NEXT:    notb %cl
+; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %r10
+; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK6-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK6-NEXT:    shlxq %rcx, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %r8, %rdi
+; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK6-NEXT:    shrxq %rsi, %r11, %rbx
+; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r14
+; FALLBACK6-NEXT:    leaq (%r14,%r14), %r8
+; FALLBACK6-NEXT:    shlxq %rcx, %r8, %r8
+; FALLBACK6-NEXT:    orq %rbx, %r8
+; FALLBACK6-NEXT:    shrxq %rsi, %r9, %rbx
+; FALLBACK6-NEXT:    addq %r11, %r11
+; FALLBACK6-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK6-NEXT:    orq %rbx, %r11
+; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %rbx
+; FALLBACK6-NEXT:    shrxq %rsi, %rbx, %r15
 ; FALLBACK6-NEXT:    movq -80(%rsp,%rax), %r12
-; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r13
-; FALLBACK6-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK6-NEXT:    leaq (%r12,%r12), %r13
+; FALLBACK6-NEXT:    shlxq %rcx, %r13, %r13
+; FALLBACK6-NEXT:    orq %r15, %r13
+; FALLBACK6-NEXT:    shrxq %rsi, %r14, %r14
+; FALLBACK6-NEXT:    addq %rbx, %rbx
+; FALLBACK6-NEXT:    shlxq %rcx, %rbx, %rbx
+; FALLBACK6-NEXT:    orq %r14, %rbx
+; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r14
 ; FALLBACK6-NEXT:    movq -72(%rsp,%rax), %rax
-; FALLBACK6-NEXT:    shrxq %rsi, %rax, %rsi
-; FALLBACK6-NEXT:    addq %rdi, %rdi
-; FALLBACK6-NEXT:    shlxq %rbx, %rdi, %rdi
-; FALLBACK6-NEXT:    orq %r9, %rdi
-; FALLBACK6-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK6-NEXT:    shlxq %rbx, %r9, %r9
-; FALLBACK6-NEXT:    orq %r14, %r9
-; FALLBACK6-NEXT:    addq %r10, %r10
-; FALLBACK6-NEXT:    shlxq %rbx, %r10, %r10
-; FALLBACK6-NEXT:    orq %r15, %r10
-; FALLBACK6-NEXT:    addq %rax, %rax
-; FALLBACK6-NEXT:    shlxq %rbx, %rax, %rax
-; FALLBACK6-NEXT:    orq %r13, %rax
-; FALLBACK6-NEXT:    addq %rcx, %rcx
-; FALLBACK6-NEXT:    shlxq %rbx, %rcx, %rcx
-; FALLBACK6-NEXT:    orq %rbp, %rcx
-; FALLBACK6-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK6-NEXT:    leaq (%rax,%rax), %r15
+; FALLBACK6-NEXT:    shlxq %rcx, %r15, %r15
+; FALLBACK6-NEXT:    orq %r14, %r15
+; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r10
+; FALLBACK6-NEXT:    addq %r9, %r9
+; FALLBACK6-NEXT:    shlxq %rcx, %r9, %rcx
+; FALLBACK6-NEXT:    orq %r10, %rcx
+; FALLBACK6-NEXT:    shrxq %rsi, %rax, %rax
+; FALLBACK6-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK6-NEXT:    movq %rcx, 8(%rdx)
-; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK6-NEXT:    movq %r10, 32(%rdx)
-; FALLBACK6-NEXT:    movq %r9, 40(%rdx)
-; FALLBACK6-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK6-NEXT:    movq %r8, (%rdx)
-; FALLBACK6-NEXT:    addq $8, %rsp
+; FALLBACK6-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK6-NEXT:    movq %rbx, 32(%rdx)
+; FALLBACK6-NEXT:    movq %r13, 40(%rdx)
+; FALLBACK6-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, (%rdx)
 ; FALLBACK6-NEXT:    popq %rbx
 ; FALLBACK6-NEXT:    popq %r12
 ; FALLBACK6-NEXT:    popq %r13
 ; FALLBACK6-NEXT:    popq %r14
 ; FALLBACK6-NEXT:    popq %r15
-; FALLBACK6-NEXT:    popq %rbp
 ; FALLBACK6-NEXT:    retq
 ;
 ; FALLBACK7-LABEL: lshr_64bytes:
@@ -12749,43 +12729,43 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK9-NEXT:    pushq %rbx
 ; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
 ; FALLBACK9-NEXT:    vmovups 32(%rdi), %ymm1
-; FALLBACK9-NEXT:    movl (%rsi), %eax
+; FALLBACK9-NEXT:    movl (%rsi), %edi
 ; FALLBACK9-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK9-NEXT:    leal (,%rdi,8), %ecx
 ; FALLBACK9-NEXT:    andl $56, %ecx
-; FALLBACK9-NEXT:    andl $56, %eax
-; FALLBACK9-NEXT:    movq -96(%rsp,%rax), %rdi
-; FALLBACK9-NEXT:    movq -104(%rsp,%rax), %r9
-; FALLBACK9-NEXT:    movq %r9, %rsi
-; FALLBACK9-NEXT:    shrdq %cl, %rdi, %rsi
-; FALLBACK9-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK9-NEXT:    andl $56, %edi
+; FALLBACK9-NEXT:    movq -96(%rsp,%rdi), %rsi
+; FALLBACK9-NEXT:    movq -104(%rsp,%rdi), %r9
+; FALLBACK9-NEXT:    movq %r9, %rax
+; FALLBACK9-NEXT:    shrdq %cl, %rsi, %rax
+; FALLBACK9-NEXT:    movq -112(%rsp,%rdi), %r10
 ; FALLBACK9-NEXT:    movq %r10, %r8
 ; FALLBACK9-NEXT:    shrdq %cl, %r9, %r8
-; FALLBACK9-NEXT:    movq -80(%rsp,%rax), %r9
-; FALLBACK9-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK9-NEXT:    movq -80(%rsp,%rdi), %r9
+; FALLBACK9-NEXT:    movq -88(%rsp,%rdi), %r11
 ; FALLBACK9-NEXT:    movq %r11, %rbx
 ; FALLBACK9-NEXT:    shrdq %cl, %r9, %rbx
-; FALLBACK9-NEXT:    shrdq %cl, %r11, %rdi
-; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK9-NEXT:    shrdq %cl, %r11, %rsi
+; FALLBACK9-NEXT:    movq -72(%rsp,%rdi), %r11
 ; FALLBACK9-NEXT:    shrdq %cl, %r11, %r9
-; FALLBACK9-NEXT:    movq -128(%rsp,%rax), %r14
-; FALLBACK9-NEXT:    movq -120(%rsp,%rax), %rax
-; FALLBACK9-NEXT:    movq %rax, %r15
+; FALLBACK9-NEXT:    movq -128(%rsp,%rdi), %r14
+; FALLBACK9-NEXT:    movq -120(%rsp,%rdi), %rdi
+; FALLBACK9-NEXT:    movq %rdi, %r15
 ; FALLBACK9-NEXT:    shrdq %cl, %r10, %r15
-; FALLBACK9-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r14
 ; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK9-NEXT:    shrq %cl, %r11
 ; FALLBACK9-NEXT:    movq %r15, 8(%rdx)
 ; FALLBACK9-NEXT:    movq %r9, 48(%rdx)
 ; FALLBACK9-NEXT:    movq %r11, 56(%rdx)
-; FALLBACK9-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 32(%rdx)
 ; FALLBACK9-NEXT:    movq %rbx, 40(%rdx)
 ; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
-; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %rax, 24(%rdx)
 ; FALLBACK9-NEXT:    movq %r14, (%rdx)
 ; FALLBACK9-NEXT:    popq %rbx
 ; FALLBACK9-NEXT:    popq %r14
@@ -12795,77 +12775,73 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK10-LABEL: lshr_64bytes:
 ; FALLBACK10:       # %bb.0:
-; FALLBACK10-NEXT:    pushq %rbp
 ; FALLBACK10-NEXT:    pushq %r15
 ; FALLBACK10-NEXT:    pushq %r14
 ; FALLBACK10-NEXT:    pushq %r13
 ; FALLBACK10-NEXT:    pushq %r12
 ; FALLBACK10-NEXT:    pushq %rbx
-; FALLBACK10-NEXT:    pushq %rax
 ; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
 ; FALLBACK10-NEXT:    vmovups 32(%rdi), %ymm1
-; FALLBACK10-NEXT:    movl (%rsi), %eax
+; FALLBACK10-NEXT:    movl (%rsi), %esi
 ; FALLBACK10-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT:    leal (,%rax,8), %esi
-; FALLBACK10-NEXT:    andl $56, %esi
+; FALLBACK10-NEXT:    leal (,%rsi,8), %eax
 ; FALLBACK10-NEXT:    andl $56, %eax
-; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %rcx
-; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %rdi
-; FALLBACK10-NEXT:    shrxq %rsi, %rdi, %r12
-; FALLBACK10-NEXT:    movq -96(%rsp,%rax), %r13
-; FALLBACK10-NEXT:    shrxq %rsi, %rcx, %r9
-; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %r10
-; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r14
-; FALLBACK10-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK10-NEXT:    movl %esi, %ebx
-; FALLBACK10-NEXT:    notb %bl
-; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %rbp
-; FALLBACK10-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK10-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK10-NEXT:    orq %r11, %r8
-; FALLBACK10-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK10-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK10-NEXT:    orq %r12, %r11
-; FALLBACK10-NEXT:    movq -80(%rsp,%rax), %r12
-; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r13
-; FALLBACK10-NEXT:    shrxq %rsi, %rbp, %rbp
-; FALLBACK10-NEXT:    movq -72(%rsp,%rax), %rax
-; FALLBACK10-NEXT:    shrxq %rsi, %rax, %rsi
-; FALLBACK10-NEXT:    addq %rdi, %rdi
-; FALLBACK10-NEXT:    shlxq %rbx, %rdi, %rdi
-; FALLBACK10-NEXT:    orq %r9, %rdi
-; FALLBACK10-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK10-NEXT:    shlxq %rbx, %r9, %r9
-; FALLBACK10-NEXT:    orq %r14, %r9
-; FALLBACK10-NEXT:    addq %r10, %r10
-; FALLBACK10-NEXT:    shlxq %rbx, %r10, %r10
-; FALLBACK10-NEXT:    orq %r15, %r10
-; FALLBACK10-NEXT:    addq %rax, %rax
-; FALLBACK10-NEXT:    shlxq %rbx, %rax, %rax
-; FALLBACK10-NEXT:    orq %r13, %rax
-; FALLBACK10-NEXT:    addq %rcx, %rcx
-; FALLBACK10-NEXT:    shlxq %rbx, %rcx, %rcx
-; FALLBACK10-NEXT:    orq %rbp, %rcx
-; FALLBACK10-NEXT:    movq %rsi, 56(%rdx)
-; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
-; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK10-NEXT:    movq %r10, 32(%rdx)
-; FALLBACK10-NEXT:    movq %r9, 40(%rdx)
-; FALLBACK10-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK10-NEXT:    movq %r8, (%rdx)
-; FALLBACK10-NEXT:    addq $8, %rsp
+; FALLBACK10-NEXT:    movl %eax, %ecx
+; FALLBACK10-NEXT:    andl $56, %esi
+; FALLBACK10-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r8
+; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    movq -120(%rsp,%rsi), %r10
+; FALLBACK10-NEXT:    movq -112(%rsp,%rsi), %r9
+; FALLBACK10-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %r8, %rdi
+; FALLBACK10-NEXT:    movq -104(%rsp,%rsi), %r11
+; FALLBACK10-NEXT:    shrxq %rcx, %r11, %rbx
+; FALLBACK10-NEXT:    movq -96(%rsp,%rsi), %r14
+; FALLBACK10-NEXT:    leaq (%r14,%r14), %r8
+; FALLBACK10-NEXT:    shlxq %rax, %r8, %r8
+; FALLBACK10-NEXT:    orq %rbx, %r8
+; FALLBACK10-NEXT:    shrxq %rcx, %r9, %rbx
+; FALLBACK10-NEXT:    addq %r11, %r11
+; FALLBACK10-NEXT:    shlxq %rax, %r11, %r11
+; FALLBACK10-NEXT:    orq %rbx, %r11
+; FALLBACK10-NEXT:    movq -88(%rsp,%rsi), %rbx
+; FALLBACK10-NEXT:    shrxq %rcx, %rbx, %r15
+; FALLBACK10-NEXT:    movq -80(%rsp,%rsi), %r12
+; FALLBACK10-NEXT:    leaq (%r12,%r12), %r13
+; FALLBACK10-NEXT:    shlxq %rax, %r13, %r13
+; FALLBACK10-NEXT:    orq %r15, %r13
+; FALLBACK10-NEXT:    shrxq %rcx, %r14, %r14
+; FALLBACK10-NEXT:    addq %rbx, %rbx
+; FALLBACK10-NEXT:    shlxq %rax, %rbx, %rbx
+; FALLBACK10-NEXT:    orq %r14, %rbx
+; FALLBACK10-NEXT:    shrxq %rcx, %r12, %r14
+; FALLBACK10-NEXT:    movq -72(%rsp,%rsi), %rsi
+; FALLBACK10-NEXT:    leaq (%rsi,%rsi), %r15
+; FALLBACK10-NEXT:    shlxq %rax, %r15, %r15
+; FALLBACK10-NEXT:    orq %r14, %r15
+; FALLBACK10-NEXT:    shrxq %rcx, %r10, %r10
+; FALLBACK10-NEXT:    addq %r9, %r9
+; FALLBACK10-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK10-NEXT:    orq %r10, %rax
+; FALLBACK10-NEXT:    shrxq %rcx, %rsi, %rcx
+; FALLBACK10-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
+; FALLBACK10-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK10-NEXT:    movq %rbx, 32(%rdx)
+; FALLBACK10-NEXT:    movq %r13, 40(%rdx)
+; FALLBACK10-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, (%rdx)
 ; FALLBACK10-NEXT:    popq %rbx
 ; FALLBACK10-NEXT:    popq %r12
 ; FALLBACK10-NEXT:    popq %r13
 ; FALLBACK10-NEXT:    popq %r14
 ; FALLBACK10-NEXT:    popq %r15
-; FALLBACK10-NEXT:    popq %rbp
 ; FALLBACK10-NEXT:    vzeroupper
 ; FALLBACK10-NEXT:    retq
 ;
@@ -12930,45 +12906,45 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK12-NEXT:    pushq %rbx
 ; FALLBACK12-NEXT:    pushq %rax
 ; FALLBACK12-NEXT:    vmovups (%rdi), %zmm0
-; FALLBACK12-NEXT:    movl (%rsi), %r9d
+; FALLBACK12-NEXT:    movl (%rsi), %r10d
 ; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK12-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK12-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT:    leal (,%r9,8), %eax
+; FALLBACK12-NEXT:    leal (,%r10,8), %eax
 ; FALLBACK12-NEXT:    andl $56, %eax
-; FALLBACK12-NEXT:    andl $56, %r9d
-; FALLBACK12-NEXT:    movq -128(%rsp,%r9), %r10
-; FALLBACK12-NEXT:    movq -120(%rsp,%r9), %r8
+; FALLBACK12-NEXT:    andl $56, %r10d
+; FALLBACK12-NEXT:    movq -128(%rsp,%r10), %r9
+; FALLBACK12-NEXT:    movq -120(%rsp,%r10), %r8
 ; FALLBACK12-NEXT:    movl %eax, %ecx
-; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    shrq %cl, %r9
 ; FALLBACK12-NEXT:    movl %eax, %esi
 ; FALLBACK12-NEXT:    notb %sil
 ; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
 ; FALLBACK12-NEXT:    movl %esi, %ecx
 ; FALLBACK12-NEXT:    shlq %cl, %rdi
-; FALLBACK12-NEXT:    orq %r10, %rdi
-; FALLBACK12-NEXT:    movq -104(%rsp,%r9), %r10
-; FALLBACK12-NEXT:    movq %r10, %rbx
+; FALLBACK12-NEXT:    orq %r9, %rdi
+; FALLBACK12-NEXT:    movq -104(%rsp,%r10), %r9
+; FALLBACK12-NEXT:    movq %r9, %rbx
 ; FALLBACK12-NEXT:    movl %eax, %ecx
 ; FALLBACK12-NEXT:    shrq %cl, %rbx
-; FALLBACK12-NEXT:    movq -96(%rsp,%r9), %r12
+; FALLBACK12-NEXT:    movq -96(%rsp,%r10), %r12
 ; FALLBACK12-NEXT:    leaq (%r12,%r12), %r11
 ; FALLBACK12-NEXT:    movl %esi, %ecx
 ; FALLBACK12-NEXT:    shlq %cl, %r11
 ; FALLBACK12-NEXT:    orq %rbx, %r11
-; FALLBACK12-NEXT:    movq -112(%rsp,%r9), %rbx
+; FALLBACK12-NEXT:    movq -112(%rsp,%r10), %rbx
 ; FALLBACK12-NEXT:    movq %rbx, %r14
 ; FALLBACK12-NEXT:    movl %eax, %ecx
 ; FALLBACK12-NEXT:    shrq %cl, %r14
-; FALLBACK12-NEXT:    addq %r10, %r10
+; FALLBACK12-NEXT:    addq %r9, %r9
 ; FALLBACK12-NEXT:    movl %esi, %ecx
-; FALLBACK12-NEXT:    shlq %cl, %r10
-; FALLBACK12-NEXT:    orq %r14, %r10
-; FALLBACK12-NEXT:    movq -88(%rsp,%r9), %r14
+; FALLBACK12-NEXT:    shlq %cl, %r9
+; FALLBACK12-NEXT:    orq %r14, %r9
+; FALLBACK12-NEXT:    movq -88(%rsp,%r10), %r14
 ; FALLBACK12-NEXT:    movq %r14, %r13
 ; FALLBACK12-NEXT:    movl %eax, %ecx
 ; FALLBACK12-NEXT:    shrq %cl, %r13
-; FALLBACK12-NEXT:    movq -80(%rsp,%r9), %rbp
+; FALLBACK12-NEXT:    movq -80(%rsp,%r10), %rbp
 ; FALLBACK12-NEXT:    leaq (%rbp,%rbp), %r15
 ; FALLBACK12-NEXT:    movl %esi, %ecx
 ; FALLBACK12-NEXT:    shlq %cl, %r15
@@ -12981,8 +12957,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK12-NEXT:    orq %r12, %r14
 ; FALLBACK12-NEXT:    movl %eax, %ecx
 ; FALLBACK12-NEXT:    shrq %cl, %rbp
-; FALLBACK12-NEXT:    movq -72(%rsp,%r9), %r9
-; FALLBACK12-NEXT:    leaq (%r9,%r9), %r12
+; FALLBACK12-NEXT:    movq -72(%rsp,%r10), %r10
+; FALLBACK12-NEXT:    leaq (%r10,%r10), %r12
 ; FALLBACK12-NEXT:    movl %esi, %ecx
 ; FALLBACK12-NEXT:    shlq %cl, %r12
 ; FALLBACK12-NEXT:    orq %rbp, %r12
@@ -12993,13 +12969,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK12-NEXT:    shlq %cl, %rbx
 ; FALLBACK12-NEXT:    orq %r8, %rbx
 ; FALLBACK12-NEXT:    movl %eax, %ecx
-; FALLBACK12-NEXT:    shrq %cl, %r9
-; FALLBACK12-NEXT:    movq %r9, 56(%rdx)
+; FALLBACK12-NEXT:    shrq %cl, %r10
+; FALLBACK12-NEXT:    movq %r10, 56(%rdx)
 ; FALLBACK12-NEXT:    movq %rbx, 8(%rdx)
 ; FALLBACK12-NEXT:    movq %r12, 48(%rdx)
 ; FALLBACK12-NEXT:    movq %r14, 32(%rdx)
 ; FALLBACK12-NEXT:    movq %r15, 40(%rdx)
-; FALLBACK12-NEXT:    movq %r10, 16(%rdx)
+; FALLBACK12-NEXT:    movq %r9, 16(%rdx)
 ; FALLBACK12-NEXT:    movq %r11, 24(%rdx)
 ; FALLBACK12-NEXT:    movq %rdi, (%rdx)
 ; FALLBACK12-NEXT:    addq $8, %rsp
@@ -13062,74 +13038,70 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK14-LABEL: lshr_64bytes:
 ; FALLBACK14:       # %bb.0:
-; FALLBACK14-NEXT:    pushq %rbp
 ; FALLBACK14-NEXT:    pushq %r15
 ; FALLBACK14-NEXT:    pushq %r14
 ; FALLBACK14-NEXT:    pushq %r13
 ; FALLBACK14-NEXT:    pushq %r12
 ; FALLBACK14-NEXT:    pushq %rbx
-; FALLBACK14-NEXT:    pushq %rax
 ; FALLBACK14-NEXT:    vmovups (%rdi), %zmm0
 ; FALLBACK14-NEXT:    movl (%rsi), %esi
 ; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK14-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT:    leal (,%rsi,8), %ecx
-; FALLBACK14-NEXT:    andl $56, %ecx
+; FALLBACK14-NEXT:    leal (,%rsi,8), %eax
+; FALLBACK14-NEXT:    andl $56, %eax
+; FALLBACK14-NEXT:    movl %eax, %ecx
 ; FALLBACK14-NEXT:    andl $56, %esi
-; FALLBACK14-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r11
-; FALLBACK14-NEXT:    movq -112(%rsp,%rsi), %rax
-; FALLBACK14-NEXT:    movq -104(%rsp,%rsi), %rdi
-; FALLBACK14-NEXT:    shrxq %rcx, %rdi, %r12
-; FALLBACK14-NEXT:    movq -96(%rsp,%rsi), %r13
-; FALLBACK14-NEXT:    shrxq %rcx, %rax, %r9
-; FALLBACK14-NEXT:    movq -88(%rsp,%rsi), %r10
-; FALLBACK14-NEXT:    shrxq %rcx, %r10, %r14
-; FALLBACK14-NEXT:    shrxq %rcx, %r13, %r15
-; FALLBACK14-NEXT:    movl %ecx, %ebx
-; FALLBACK14-NEXT:    notb %bl
-; FALLBACK14-NEXT:    movq -120(%rsp,%rsi), %rbp
-; FALLBACK14-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK14-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK14-NEXT:    orq %r11, %r8
-; FALLBACK14-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK14-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK14-NEXT:    orq %r12, %r11
+; FALLBACK14-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r8
+; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    movq -120(%rsp,%rsi), %r10
+; FALLBACK14-NEXT:    movq -112(%rsp,%rsi), %r9
+; FALLBACK14-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %r8, %rdi
+; FALLBACK14-NEXT:    movq -104(%rsp,%rsi), %r11
+; FALLBACK14-NEXT:    shrxq %rcx, %r11, %rbx
+; FALLBACK14-NEXT:    movq -96(%rsp,%rsi), %r14
+; FALLBACK14-NEXT:    leaq (%r14,%r14), %r8
+; FALLBACK14-NEXT:    shlxq %rax, %r8, %r8
+; FALLBACK14-NEXT:    orq %rbx, %r8
+; FALLBACK14-NEXT:    shrxq %rcx, %r9, %rbx
+; FALLBACK14-NEXT:    addq %r11, %r11
+; FALLBACK14-NEXT:    shlxq %rax, %r11, %r11
+; FALLBACK14-NEXT:    orq %rbx, %r11
+; FALLBACK14-NEXT:    movq -88(%rsp,%rsi), %rbx
+; FALLBACK14-NEXT:    shrxq %rcx, %rbx, %r15
 ; FALLBACK14-NEXT:    movq -80(%rsp,%rsi), %r12
-; FALLBACK14-NEXT:    shrxq %rcx, %r12, %r13
-; FALLBACK14-NEXT:    shrxq %rcx, %rbp, %rbp
+; FALLBACK14-NEXT:    leaq (%r12,%r12), %r13
+; FALLBACK14-NEXT:    shlxq %rax, %r13, %r13
+; FALLBACK14-NEXT:    orq %r15, %r13
+; FALLBACK14-NEXT:    shrxq %rcx, %r14, %r14
+; FALLBACK14-NEXT:    addq %rbx, %rbx
+; FALLBACK14-NEXT:    shlxq %rax, %rbx, %rbx
+; FALLBACK14-NEXT:    orq %r14, %rbx
+; FALLBACK14-NEXT:    shrxq %rcx, %r12, %r14
 ; FALLBACK14-NEXT:    movq -72(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT:    leaq (%rsi,%rsi), %r15
+; FALLBACK14-NEXT:    shlxq %rax, %r15, %r15
+; FALLBACK14-NEXT:    orq %r14, %r15
+; FALLBACK14-NEXT:    shrxq %rcx, %r10, %r10
+; FALLBACK14-NEXT:    addq %r9, %r9
+; FALLBACK14-NEXT:    shlxq %rax, %r9, %rax
+; FALLBACK14-NEXT:    orq %r10, %rax
 ; FALLBACK14-NEXT:    shrxq %rcx, %rsi, %rcx
-; FALLBACK14-NEXT:    addq %rdi, %rdi
-; FALLBACK14-NEXT:    shlxq %rbx, %rdi, %rdi
-; FALLBACK14-NEXT:    orq %r9, %rdi
-; FALLBACK14-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK14-NEXT:    shlxq %rbx, %r9, %r9
-; FALLBACK14-NEXT:    orq %r14, %r9
-; FALLBACK14-NEXT:    addq %r10, %r10
-; FALLBACK14-NEXT:    shlxq %rbx, %r10, %r10
-; FALLBACK14-NEXT:    orq %r15, %r10
-; FALLBACK14-NEXT:    addq %rsi, %rsi
-; FALLBACK14-NEXT:    shlxq %rbx, %rsi, %rsi
-; FALLBACK14-NEXT:    orq %r13, %rsi
-; FALLBACK14-NEXT:    addq %rax, %rax
-; FALLBACK14-NEXT:    shlxq %rbx, %rax, %rax
-; FALLBACK14-NEXT:    orq %rbp, %rax
 ; FALLBACK14-NEXT:    movq %rcx, 56(%rdx)
 ; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
-; FALLBACK14-NEXT:    movq %rsi, 48(%rdx)
-; FALLBACK14-NEXT:    movq %r10, 32(%rdx)
-; FALLBACK14-NEXT:    movq %r9, 40(%rdx)
-; FALLBACK14-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK14-NEXT:    movq %r8, (%rdx)
-; FALLBACK14-NEXT:    addq $8, %rsp
+; FALLBACK14-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK14-NEXT:    movq %rbx, 32(%rdx)
+; FALLBACK14-NEXT:    movq %r13, 40(%rdx)
+; FALLBACK14-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, (%rdx)
 ; FALLBACK14-NEXT:    popq %rbx
 ; FALLBACK14-NEXT:    popq %r12
 ; FALLBACK14-NEXT:    popq %r13
 ; FALLBACK14-NEXT:    popq %r14
 ; FALLBACK14-NEXT:    popq %r15
-; FALLBACK14-NEXT:    popq %rbp
 ; FALLBACK14-NEXT:    vzeroupper
 ; FALLBACK14-NEXT:    retq
 ;
@@ -13139,40 +13111,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK15-NEXT:    pushq %r14
 ; FALLBACK15-NEXT:    pushq %rbx
 ; FALLBACK15-NEXT:    vmovups (%rdi), %zmm0
-; FALLBACK15-NEXT:    movl (%rsi), %eax
+; FALLBACK15-NEXT:    movl (%rsi), %edi
 ; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK15-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK15-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK15-NEXT:    leal (,%rdi,8), %ecx
 ; FALLBACK15-NEXT:    andl $56, %ecx
-; FALLBACK15-NEXT:    andl $56, %eax
-; FALLBACK15-NEXT:    movq -96(%rsp,%rax), %rdi
-; FALLBACK15-NEXT:    movq -104(%rsp,%rax), %r9
-; FALLBACK15-NEXT:    movq %r9, %rsi
-; FALLBACK15-NEXT:    shrdq %cl, %rdi, %rsi
-; FALLBACK15-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK15-NEXT:    andl $56, %edi
+; FALLBACK15-NEXT:    movq -96(%rsp,%rdi), %rsi
+; FALLBACK15-NEXT:    movq -104(%rsp,%rdi), %r9
+; FALLBACK15-NEXT:    movq %r9, %rax
+; FALLBACK15-NEXT:    shrdq %cl, %rsi, %rax
+; FALLBACK15-NEXT:    movq -112(%rsp,%rdi), %r10
 ; FALLBACK15-NEXT:    movq %r10, %r8
 ; FALLBACK15-NEXT:    shrdq %cl, %r9, %r8
-; FALLBACK15-NEXT:    movq -80(%rsp,%rax), %r9
-; FALLBACK15-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK15-NEXT:    movq -80(%rsp,%rdi), %r9
+; FALLBACK15-NEXT:    movq -88(%rsp,%rdi), %r11
 ; FALLBACK15-NEXT:    movq %r11, %rbx
 ; FALLBACK15-NEXT:    shrdq %cl, %r9, %rbx
-; FALLBACK15-NEXT:    shrdq %cl, %r11, %rdi
-; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK15-NEXT:    shrdq %cl, %r11, %rsi
+; FALLBACK15-NEXT:    movq -72(%rsp,%rdi), %r11
 ; FALLBACK15-NEXT:    shrdq %cl, %r11, %r9
-; FALLBACK15-NEXT:    movq -128(%rsp,%rax), %r14
-; FALLBACK15-NEXT:    movq -120(%rsp,%rax), %rax
-; FALLBACK15-NEXT:    movq %rax, %r15
+; FALLBACK15-NEXT:    movq -128(%rsp,%rdi), %r14
+; FALLBACK15-NEXT:    movq -120(%rsp,%rdi), %rdi
+; FALLBACK15-NEXT:    movq %rdi, %r15
 ; FALLBACK15-NEXT:    shrdq %cl, %r10, %r15
 ; FALLBACK15-NEXT:    shrxq %rcx, %r11, %r10
 ; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
-; FALLBACK15-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r14
 ; FALLBACK15-NEXT:    movq %r15, 8(%rdx)
 ; FALLBACK15-NEXT:    movq %r9, 48(%rdx)
-; FALLBACK15-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK15-NEXT:    movq %rsi, 32(%rdx)
 ; FALLBACK15-NEXT:    movq %rbx, 40(%rdx)
 ; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
-; FALLBACK15-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
 ; FALLBACK15-NEXT:    movq %r14, (%rdx)
 ; FALLBACK15-NEXT:    movq %r10, 56(%rdx)
 ; FALLBACK15-NEXT:    popq %rbx
@@ -13618,14 +13590,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 36(%eax), %ecx
 ; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 40(%eax), %ebp
-; FALLBACK18-NEXT:    movl 44(%eax), %ebx
+; FALLBACK18-NEXT:    movl 40(%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 44(%eax), %ebp
 ; FALLBACK18-NEXT:    movl 48(%eax), %edi
 ; FALLBACK18-NEXT:    movl 52(%eax), %esi
 ; FALLBACK18-NEXT:    movl 56(%eax), %edx
 ; FALLBACK18-NEXT:    movl 60(%eax), %ecx
 ; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT:    movl (%eax), %eax
+; FALLBACK18-NEXT:    movl (%eax), %ebx
 ; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
@@ -13634,136 +13607,138 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %eax, %ecx
-; FALLBACK18-NEXT:    leal (,%eax,8), %edx
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    leal (,%ebx,8), %edx
 ; FALLBACK18-NEXT:    andl $24, %edx
-; FALLBACK18-NEXT:    andl $60, %ecx
-; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
-; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl %edx, %ecx
+; FALLBACK18-NEXT:    andl $60, %ebx
+; FALLBACK18-NEXT:    movl 68(%esp,%ebx), %esi
+; FALLBACK18-NEXT:    movl 72(%esp,%ebx), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %edi
-; FALLBACK18-NEXT:    movl %edx, %ebx
-; FALLBACK18-NEXT:    notb %bl
+; FALLBACK18-NEXT:    shrxl %ecx, %esi, %edi
+; FALLBACK18-NEXT:    notb %dl
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %ebp
-; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
 ; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %ecx, 64(%esp,%ebx), %edi
 ; FALLBACK18-NEXT:    addl %esi, %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl 80(%esp,%ebx), %esi
 ; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT:    movl 76(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK18-NEXT:    movl 76(%esp,%ebx), %edi
+; FALLBACK18-NEXT:    shrxl %ecx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK18-NEXT:    orl %eax, %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl 88(%esp,%ebx), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT:    movl 84(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK18-NEXT:    movl 84(%esp,%ebx), %edi
+; FALLBACK18-NEXT:    shrxl %ecx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    shrxl %ecx, %esi, %esi
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    orl %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl 96(%esp,%ebx), %esi
 ; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT:    movl 92(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK18-NEXT:    movl 92(%esp,%ebx), %edi
+; FALLBACK18-NEXT:    shrxl %ecx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK18-NEXT:    orl %eax, %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl 104(%esp,%ebx), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK18-NEXT:    movl 100(%esp,%ebx), %edi
+; FALLBACK18-NEXT:    shrxl %ecx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    shrxl %ecx, %esi, %esi
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    orl %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %eax
+; FALLBACK18-NEXT:    movl 112(%esp,%ebx), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
-; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK18-NEXT:    movl %ecx, %edi
-; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK18-NEXT:    movl 108(%esp,%ebx), %esi
+; FALLBACK18-NEXT:    shrxl %ecx, %esi, %edi
+; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %ecx, %ebp
 ; FALLBACK18-NEXT:    addl %esi, %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK18-NEXT:    orl %ecx, %esi
-; FALLBACK18-NEXT:    movl 120(%esp,%edi), %ebp
-; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT:    movl 116(%esp,%edi), %eax
-; FALLBACK18-NEXT:    shrxl %edx, %eax, %edi
-; FALLBACK18-NEXT:    orl %edi, %ecx
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %ecx
+; FALLBACK18-NEXT:    orl %eax, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 120(%esp,%ebx), %edi
+; FALLBACK18-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK18-NEXT:    movl 116(%esp,%ebx), %eax
+; FALLBACK18-NEXT:    movl %ebp, %ecx
+; FALLBACK18-NEXT:    shrxl %ebp, %eax, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %esi
+; FALLBACK18-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %ecx, %ebp
 ; FALLBACK18-NEXT:    addl %eax, %eax
-; FALLBACK18-NEXT:    shlxl %ebx, %eax, %edi
-; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT:    shrxl %edx, %ebp, %eax
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK18-NEXT:    movl 124(%esp,%ebp), %ebp
-; FALLBACK18-NEXT:    shrxl %edx, %ebp, %edx
-; FALLBACK18-NEXT:    addl %ebp, %ebp
-; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %ebx
-; FALLBACK18-NEXT:    orl %eax, %ebx
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl 124(%esp,%ebx), %eax
+; FALLBACK18-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK18-NEXT:    shlxl %edx, %ebx, %edx
+; FALLBACK18-NEXT:    shrxl %ebp, %edi, %edi
+; FALLBACK18-NEXT:    orl %edi, %edx
+; FALLBACK18-NEXT:    shrxl %ebp, %eax, %edi
 ; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT:    movl %edx, 60(%eax)
-; FALLBACK18-NEXT:    movl %ebx, 56(%eax)
-; FALLBACK18-NEXT:    movl %edi, 48(%eax)
-; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK18-NEXT:    movl %esi, 40(%eax)
+; FALLBACK18-NEXT:    movl %edi, 60(%eax)
+; FALLBACK18-NEXT:    movl %edx, 56(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK18-NEXT:    movl %esi, 52(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -14284,7 +14259,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
 ; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
 ; FALLBACK22-NEXT:    movups 48(%ecx), %xmm3
-; FALLBACK22-NEXT:    movl (%eax), %ecx
+; FALLBACK22-NEXT:    movl (%eax), %ebx
 ; FALLBACK22-NEXT:    xorps %xmm4, %xmm4
 ; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
@@ -14294,112 +14269,114 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    leal (,%ecx,8), %edx
+; FALLBACK22-NEXT:    leal (,%ebx,8), %edx
 ; FALLBACK22-NEXT:    andl $24, %edx
-; FALLBACK22-NEXT:    andl $60, %ecx
-; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
-; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl %edx, %ecx
+; FALLBACK22-NEXT:    andl $60, %ebx
+; FALLBACK22-NEXT:    movl 68(%esp,%ebx), %esi
+; FALLBACK22-NEXT:    movl 72(%esp,%ebx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %edi
-; FALLBACK22-NEXT:    movl %edx, %ebx
-; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    shrxl %ecx, %esi, %edi
+; FALLBACK22-NEXT:    notb %dl
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %ebp
-; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebp
-; FALLBACK22-NEXT:    orl %edi, %ebp
-; FALLBACK22-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shlxl %edx, %ebp, %eax
+; FALLBACK22-NEXT:    orl %edi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %ecx, 64(%esp,%ebx), %edi
 ; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %edi, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    orl %edi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 80(%esp,%ebx), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 76(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK22-NEXT:    movl 76(%esp,%ebx), %edi
+; FALLBACK22-NEXT:    shrxl %ecx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK22-NEXT:    orl %eax, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl 88(%esp,%ebx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK22-NEXT:    movl 84(%esp,%ebx), %edi
+; FALLBACK22-NEXT:    shrxl %ecx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    shrxl %ecx, %esi, %esi
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    orl %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl 96(%esp,%ebx), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK22-NEXT:    movl 92(%esp,%ebx), %edi
+; FALLBACK22-NEXT:    shrxl %ecx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK22-NEXT:    orl %eax, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl 104(%esp,%ebx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK22-NEXT:    movl 100(%esp,%ebx), %edi
+; FALLBACK22-NEXT:    shrxl %ecx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    shrxl %ecx, %esi, %esi
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    orl %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl %ecx, %eax
-; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    leal (%ecx,%ecx), %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %ecx
-; FALLBACK22-NEXT:    movl 108(%esp,%eax), %esi
+; FALLBACK22-NEXT:    movl 112(%esp,%ebx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    leal (%eax,%eax), %esi
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    movl 108(%esp,%ebx), %esi
+; FALLBACK22-NEXT:    shrxl %ecx, %esi, %edi
+; FALLBACK22-NEXT:    orl %edi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %ecx, %ebp
 ; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %ecx, %esi
-; FALLBACK22-NEXT:    movl 120(%esp,%eax), %ebp
-; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
-; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT:    movl 116(%esp,%eax), %eax
-; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
-; FALLBACK22-NEXT:    orl %edi, %ecx
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %ecx
+; FALLBACK22-NEXT:    orl %eax, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 120(%esp,%ebx), %edi
+; FALLBACK22-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK22-NEXT:    movl 116(%esp,%ebx), %eax
+; FALLBACK22-NEXT:    movl %ebp, %ecx
+; FALLBACK22-NEXT:    shrxl %ebp, %eax, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %esi
+; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %ecx, %ebp
 ; FALLBACK22-NEXT:    addl %eax, %eax
-; FALLBACK22-NEXT:    shlxl %ebx, %eax, %edi
-; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT:    shrxl %edx, %ebp, %eax
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK22-NEXT:    movl 124(%esp,%ebp), %ebp
-; FALLBACK22-NEXT:    shrxl %edx, %ebp, %edx
-; FALLBACK22-NEXT:    addl %ebp, %ebp
-; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebx
-; FALLBACK22-NEXT:    orl %eax, %ebx
+; FALLBACK22-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl 124(%esp,%ebx), %eax
+; FALLBACK22-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK22-NEXT:    shlxl %edx, %ebx, %edx
+; FALLBACK22-NEXT:    shrxl %ebp, %edi, %edi
+; FALLBACK22-NEXT:    orl %edi, %edx
+; FALLBACK22-NEXT:    shrxl %ebp, %eax, %edi
 ; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT:    movl %edx, 60(%eax)
-; FALLBACK22-NEXT:    movl %ebx, 56(%eax)
-; FALLBACK22-NEXT:    movl %edi, 48(%eax)
-; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK22-NEXT:    movl %esi, 40(%eax)
+; FALLBACK22-NEXT:    movl %edi, 60(%eax)
+; FALLBACK22-NEXT:    movl %edx, 56(%eax)
+; FALLBACK22-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK22-NEXT:    movl %esi, 52(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -14873,109 +14850,107 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    leal (,%ecx,8), %edx
 ; FALLBACK26-NEXT:    andl $24, %edx
+; FALLBACK26-NEXT:    movl %edx, %ebx
 ; FALLBACK26-NEXT:    andl $60, %ecx
 ; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %edi
-; FALLBACK26-NEXT:    movl %edx, %ebx
-; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK26-NEXT:    notb %dl
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %ebp
-; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT:    shlxl %edx, %ebp, %ebp
 ; FALLBACK26-NEXT:    orl %edi, %ebp
 ; FALLBACK26-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    addl %esi, %esi
-; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK26-NEXT:    orl %edi, %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 80(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 76(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK26-NEXT:    orl %eax, %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 88(%esp,%ecx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 84(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    orl %esi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 92(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK26-NEXT:    orl %eax, %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 104(%esp,%ecx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    orl %esi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK26-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK26-NEXT:    orl %edi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %esi, %esi
-; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %eax, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 120(%esp,%ecx), %ebp
-; FALLBACK26-NEXT:    leal (%ebp,%ebp), %eax
-; FALLBACK26-NEXT:    shlxl %ebx, %eax, %esi
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    orl %eax, %ebp
+; FALLBACK26-NEXT:    movl 120(%esp,%ecx), %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %eax, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %eax, %esi
 ; FALLBACK26-NEXT:    movl 116(%esp,%ecx), %eax
-; FALLBACK26-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %edi
 ; FALLBACK26-NEXT:    orl %edi, %esi
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %eax, %eax
-; FALLBACK26-NEXT:    shlxl %ebx, %eax, %edi
-; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK26-NEXT:    shrxl %edx, %ebp, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %eax, %eax
+; FALLBACK26-NEXT:    orl %edi, %eax
 ; FALLBACK26-NEXT:    movl 124(%esp,%ecx), %ecx
-; FALLBACK26-NEXT:    shrxl %edx, %ecx, %edx
-; FALLBACK26-NEXT:    addl %ecx, %ecx
-; FALLBACK26-NEXT:    shlxl %ebx, %ecx, %ebx
-; FALLBACK26-NEXT:    orl %eax, %ebx
+; FALLBACK26-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %edx
+; FALLBACK26-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    orl %edi, %edx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %edi
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT:    movl %edx, 60(%ecx)
-; FALLBACK26-NEXT:    movl %ebx, 56(%ecx)
-; FALLBACK26-NEXT:    movl %edi, 48(%ecx)
+; FALLBACK26-NEXT:    movl %edi, 60(%ecx)
+; FALLBACK26-NEXT:    movl %edx, 56(%ecx)
+; FALLBACK26-NEXT:    movl %eax, 48(%ecx)
 ; FALLBACK26-NEXT:    movl %esi, 52(%ecx)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT:    movl %eax, 40(%ecx)
+; FALLBACK26-NEXT:    movl %ebp, 40(%ecx)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK26-NEXT:    movl %eax, 44(%ecx)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -15430,115 +15405,113 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK30-NEXT:    vmovups (%ecx), %zmm0
-; FALLBACK30-NEXT:    movl (%eax), %edx
+; FALLBACK30-NEXT:    movl (%eax), %ecx
 ; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK30-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    leal (,%edx,8), %ecx
-; FALLBACK30-NEXT:    andl $24, %ecx
-; FALLBACK30-NEXT:    andl $60, %edx
-; FALLBACK30-NEXT:    movl 68(%esp,%edx), %esi
-; FALLBACK30-NEXT:    movl 72(%esp,%edx), %eax
+; FALLBACK30-NEXT:    leal (,%ecx,8), %edx
+; FALLBACK30-NEXT:    andl $24, %edx
+; FALLBACK30-NEXT:    movl %edx, %ebx
+; FALLBACK30-NEXT:    andl $60, %ecx
+; FALLBACK30-NEXT:    movl 68(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    movl 72(%esp,%ecx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %ecx, %esi, %edi
-; FALLBACK30-NEXT:    movl %ecx, %ebx
-; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK30-NEXT:    notb %dl
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %ebp
-; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT:    shlxl %edx, %ebp, %ebp
 ; FALLBACK30-NEXT:    orl %edi, %ebp
 ; FALLBACK30-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %ecx, 64(%esp,%edx), %edi
+; FALLBACK30-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
 ; FALLBACK30-NEXT:    addl %esi, %esi
-; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK30-NEXT:    orl %edi, %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 80(%esp,%edx), %esi
+; FALLBACK30-NEXT:    movl 80(%esp,%ecx), %esi
 ; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT:    movl 76(%esp,%edx), %edi
-; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK30-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK30-NEXT:    orl %eax, %edi
 ; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 88(%esp,%edx), %eax
+; FALLBACK30-NEXT:    movl 88(%esp,%ecx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT:    movl 84(%esp,%edx), %edi
-; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK30-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK30-NEXT:    orl %esi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 96(%esp,%edx), %esi
+; FALLBACK30-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT:    movl 92(%esp,%edx), %edi
-; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK30-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK30-NEXT:    orl %eax, %edi
 ; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 104(%esp,%edx), %eax
+; FALLBACK30-NEXT:    movl 104(%esp,%ecx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT:    movl 100(%esp,%edx), %edi
-; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
+; FALLBACK30-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK30-NEXT:    orl %esi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 112(%esp,%edx), %eax
+; FALLBACK30-NEXT:    movl 112(%esp,%ecx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
-; FALLBACK30-NEXT:    movl 108(%esp,%edx), %esi
-; FALLBACK30-NEXT:    shrxl %ecx, %esi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK30-NEXT:    movl 108(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK30-NEXT:    orl %edi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    addl %esi, %esi
-; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %eax, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 120(%esp,%edx), %ebp
-; FALLBACK30-NEXT:    leal (%ebp,%ebp), %eax
-; FALLBACK30-NEXT:    shlxl %ebx, %eax, %esi
-; FALLBACK30-NEXT:    movl 116(%esp,%edx), %eax
-; FALLBACK30-NEXT:    shrxl %ecx, %eax, %edi
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    orl %eax, %ebp
+; FALLBACK30-NEXT:    movl 120(%esp,%ecx), %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %eax, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %eax, %esi
+; FALLBACK30-NEXT:    movl 116(%esp,%ecx), %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %edi
 ; FALLBACK30-NEXT:    orl %edi, %esi
-; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    addl %eax, %eax
-; FALLBACK30-NEXT:    shlxl %ebx, %eax, %edi
-; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK30-NEXT:    shrxl %ecx, %ebp, %eax
-; FALLBACK30-NEXT:    movl 124(%esp,%edx), %edx
-; FALLBACK30-NEXT:    shrxl %ecx, %edx, %ebp
-; FALLBACK30-NEXT:    leal (%edx,%edx), %ecx
-; FALLBACK30-NEXT:    shlxl %ebx, %ecx, %edx
-; FALLBACK30-NEXT:    orl %eax, %edx
+; FALLBACK30-NEXT:    shlxl %edx, %eax, %eax
+; FALLBACK30-NEXT:    orl %edi, %eax
+; FALLBACK30-NEXT:    movl 124(%esp,%ecx), %ecx
+; FALLBACK30-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %edx
+; FALLBACK30-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    orl %edi, %edx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %edi
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT:    movl %ebp, 60(%ecx)
+; FALLBACK30-NEXT:    movl %edi, 60(%ecx)
 ; FALLBACK30-NEXT:    movl %edx, 56(%ecx)
-; FALLBACK30-NEXT:    movl %edi, 48(%ecx)
+; FALLBACK30-NEXT:    movl %eax, 48(%ecx)
 ; FALLBACK30-NEXT:    movl %esi, 52(%ecx)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT:    movl %eax, 40(%ecx)
+; FALLBACK30-NEXT:    movl %ebp, 40(%ecx)
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK30-NEXT:    movl %eax, 44(%ecx)
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -16196,10 +16169,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK2-LABEL: shl_64bytes:
 ; FALLBACK2:       # %bb.0:
-; FALLBACK2-NEXT:    pushq %rbp
 ; FALLBACK2-NEXT:    pushq %r15
 ; FALLBACK2-NEXT:    pushq %r14
-; FALLBACK2-NEXT:    pushq %r13
 ; FALLBACK2-NEXT:    pushq %r12
 ; FALLBACK2-NEXT:    pushq %rbx
 ; FALLBACK2-NEXT:    pushq %rax
@@ -16227,62 +16198,60 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
 ; FALLBACK2-NEXT:    andl $56, %eax
+; FALLBACK2-NEXT:    movl %eax, %ecx
 ; FALLBACK2-NEXT:    andl $56, %esi
 ; FALLBACK2-NEXT:    negl %esi
 ; FALLBACK2-NEXT:    movslq %esi, %rsi
-; FALLBACK2-NEXT:    movq -64(%rsp,%rsi), %r10
-; FALLBACK2-NEXT:    movq -56(%rsp,%rsi), %rcx
-; FALLBACK2-NEXT:    shlxq %rax, %rcx, %r9
-; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %rdi
-; FALLBACK2-NEXT:    shlxq %rax, %rdi, %r11
-; FALLBACK2-NEXT:    movq -48(%rsp,%rsi), %r14
-; FALLBACK2-NEXT:    shlxq %rax, %r14, %rbx
-; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %r8
-; FALLBACK2-NEXT:    shlxq %rax, %r8, %r15
-; FALLBACK2-NEXT:    shlxq %rax, %r10, %r12
-; FALLBACK2-NEXT:    movl %eax, %r13d
-; FALLBACK2-NEXT:    notb %r13b
-; FALLBACK2-NEXT:    shrq %r10
-; FALLBACK2-NEXT:    shrxq %r13, %r10, %r10
-; FALLBACK2-NEXT:    orq %r9, %r10
-; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %r9
-; FALLBACK2-NEXT:    shlxq %rax, %r9, %rbp
-; FALLBACK2-NEXT:    shrq %r14
-; FALLBACK2-NEXT:    shrxq %r13, %r14, %r14
-; FALLBACK2-NEXT:    orq %r11, %r14
-; FALLBACK2-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r11
-; FALLBACK2-NEXT:    movq -16(%rsp,%rsi), %rsi
-; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rax
-; FALLBACK2-NEXT:    shrq %rcx
-; FALLBACK2-NEXT:    shrxq %r13, %rcx, %rcx
-; FALLBACK2-NEXT:    orq %rbx, %rcx
+; FALLBACK2-NEXT:    movq -64(%rsp,%rsi), %r9
+; FALLBACK2-NEXT:    movq -56(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT:    shlxq %rcx, %rdi, %r8
+; FALLBACK2-NEXT:    notb %al
+; FALLBACK2-NEXT:    shlxq %rcx, %r9, %r10
 ; FALLBACK2-NEXT:    shrq %r9
-; FALLBACK2-NEXT:    shrxq %r13, %r9, %r9
-; FALLBACK2-NEXT:    orq %r15, %r9
+; FALLBACK2-NEXT:    shrxq %rax, %r9, %r9
+; FALLBACK2-NEXT:    orq %r8, %r9
+; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %r11
+; FALLBACK2-NEXT:    shlxq %rcx, %r11, %rbx
+; FALLBACK2-NEXT:    movq -48(%rsp,%rsi), %r8
+; FALLBACK2-NEXT:    shlxq %rcx, %r8, %r14
+; FALLBACK2-NEXT:    shrq %r8
+; FALLBACK2-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK2-NEXT:    orq %rbx, %r8
 ; FALLBACK2-NEXT:    shrq %rdi
-; FALLBACK2-NEXT:    shrxq %r13, %rdi, %rdi
-; FALLBACK2-NEXT:    orq %rbp, %rdi
+; FALLBACK2-NEXT:    shrxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r14, %rdi
+; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %rbx
+; FALLBACK2-NEXT:    shlxq %rcx, %rbx, %r14
+; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %r15
+; FALLBACK2-NEXT:    shlxq %rcx, %r15, %r12
+; FALLBACK2-NEXT:    shrq %r15
+; FALLBACK2-NEXT:    shrxq %rax, %r15, %r15
+; FALLBACK2-NEXT:    orq %r14, %r15
+; FALLBACK2-NEXT:    shrq %r11
+; FALLBACK2-NEXT:    shrxq %rax, %r11, %r11
+; FALLBACK2-NEXT:    orq %r12, %r11
+; FALLBACK2-NEXT:    shlxq %rcx, -8(%rsp,%rsi), %r14
+; FALLBACK2-NEXT:    movq -16(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT:    shlxq %rcx, %rsi, %rcx
 ; FALLBACK2-NEXT:    shrq %rsi
-; FALLBACK2-NEXT:    shrxq %r13, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r11, %rsi
-; FALLBACK2-NEXT:    shrq %r8
-; FALLBACK2-NEXT:    shrxq %r13, %r8, %r8
-; FALLBACK2-NEXT:    orq %rax, %r8
-; FALLBACK2-NEXT:    movq %r12, (%rdx)
-; FALLBACK2-NEXT:    movq %r8, 48(%rdx)
+; FALLBACK2-NEXT:    shrxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT:    orq %r14, %rsi
+; FALLBACK2-NEXT:    shrq %rbx
+; FALLBACK2-NEXT:    shrxq %rax, %rbx, %rax
+; FALLBACK2-NEXT:    orq %rcx, %rax
+; FALLBACK2-NEXT:    movq %r10, (%rdx)
+; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
 ; FALLBACK2-NEXT:    movq %rsi, 56(%rdx)
-; FALLBACK2-NEXT:    movq %rdi, 32(%rdx)
-; FALLBACK2-NEXT:    movq %r9, 40(%rdx)
-; FALLBACK2-NEXT:    movq %rcx, 16(%rdx)
-; FALLBACK2-NEXT:    movq %r14, 24(%rdx)
-; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
+; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r15, 40(%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 16(%rdx)
+; FALLBACK2-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
 ; FALLBACK2-NEXT:    addq $8, %rsp
 ; FALLBACK2-NEXT:    popq %rbx
 ; FALLBACK2-NEXT:    popq %r12
-; FALLBACK2-NEXT:    popq %r13
 ; FALLBACK2-NEXT:    popq %r14
 ; FALLBACK2-NEXT:    popq %r15
-; FALLBACK2-NEXT:    popq %rbp
 ; FALLBACK2-NEXT:    retq
 ;
 ; FALLBACK3-LABEL: shl_64bytes:
@@ -16509,86 +16478,81 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK6-LABEL: shl_64bytes:
 ; FALLBACK6:       # %bb.0:
-; FALLBACK6-NEXT:    pushq %rbp
 ; FALLBACK6-NEXT:    pushq %r15
 ; FALLBACK6-NEXT:    pushq %r14
-; FALLBACK6-NEXT:    pushq %r13
 ; FALLBACK6-NEXT:    pushq %r12
 ; FALLBACK6-NEXT:    pushq %rbx
-; FALLBACK6-NEXT:    subq $24, %rsp
+; FALLBACK6-NEXT:    pushq %rax
 ; FALLBACK6-NEXT:    movups (%rdi), %xmm0
 ; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
 ; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
 ; FALLBACK6-NEXT:    movups 48(%rdi), %xmm3
-; FALLBACK6-NEXT:    movl (%rsi), %eax
+; FALLBACK6-NEXT:    movl (%rsi), %esi
 ; FALLBACK6-NEXT:    xorps %xmm4, %xmm4
 ; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT:    movaps %xmm3, (%rsp)
+; FALLBACK6-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK6-NEXT:    andl $56, %ecx
+; FALLBACK6-NEXT:    leal (,%rsi,8), %eax
 ; FALLBACK6-NEXT:    andl $56, %eax
-; FALLBACK6-NEXT:    negl %eax
-; FALLBACK6-NEXT:    movslq %eax, %rsi
-; FALLBACK6-NEXT:    movq -8(%rsp,%rsi), %rax
-; FALLBACK6-NEXT:    shlxq %rcx, %rax, %r12
-; FALLBACK6-NEXT:    movq -16(%rsp,%rsi), %rdi
-; FALLBACK6-NEXT:    shlxq %rcx, %rdi, %r15
-; FALLBACK6-NEXT:    movq -24(%rsp,%rsi), %r13
-; FALLBACK6-NEXT:    shlxq %rcx, %r13, %r8
-; FALLBACK6-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; FALLBACK6-NEXT:    movq -32(%rsp,%rsi), %r11
-; FALLBACK6-NEXT:    shlxq %rcx, %r11, %r10
-; FALLBACK6-NEXT:    movq -40(%rsp,%rsi), %r14
-; FALLBACK6-NEXT:    shlxq %rcx, %r14, %rbx
-; FALLBACK6-NEXT:    movl %ecx, %r9d
-; FALLBACK6-NEXT:    notb %r9b
+; FALLBACK6-NEXT:    movl %eax, %ecx
+; FALLBACK6-NEXT:    andl $56, %esi
+; FALLBACK6-NEXT:    negl %esi
+; FALLBACK6-NEXT:    movslq %esi, %rsi
+; FALLBACK6-NEXT:    movq -24(%rsp,%rsi), %rdi
+; FALLBACK6-NEXT:    shlxq %rcx, %rdi, %r9
+; FALLBACK6-NEXT:    notb %al
+; FALLBACK6-NEXT:    movq -32(%rsp,%rsi), %r8
+; FALLBACK6-NEXT:    shlxq %rcx, %r8, %r10
+; FALLBACK6-NEXT:    shrq %r8
+; FALLBACK6-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK6-NEXT:    orq %r9, %r8
+; FALLBACK6-NEXT:    movq -40(%rsp,%rsi), %r9
+; FALLBACK6-NEXT:    shlxq %rcx, %r9, %r11
+; FALLBACK6-NEXT:    shrq %r9
+; FALLBACK6-NEXT:    shrxq %rax, %r9, %r9
+; FALLBACK6-NEXT:    orq %r10, %r9
+; FALLBACK6-NEXT:    movq -48(%rsp,%rsi), %r10
+; FALLBACK6-NEXT:    shlxq %rcx, %r10, %r14
+; FALLBACK6-NEXT:    shrq %r10
+; FALLBACK6-NEXT:    shrxq %rax, %r10, %r10
+; FALLBACK6-NEXT:    orq %r11, %r10
+; FALLBACK6-NEXT:    movq -64(%rsp,%rsi), %rbx
+; FALLBACK6-NEXT:    movq -56(%rsp,%rsi), %r11
+; FALLBACK6-NEXT:    shlxq %rcx, %r11, %r15
+; FALLBACK6-NEXT:    shrq %r11
+; FALLBACK6-NEXT:    shrxq %rax, %r11, %r11
+; FALLBACK6-NEXT:    orq %r14, %r11
+; FALLBACK6-NEXT:    shlxq %rcx, %rbx, %r14
+; FALLBACK6-NEXT:    shrq %rbx
+; FALLBACK6-NEXT:    shrxq %rax, %rbx, %rbx
+; FALLBACK6-NEXT:    orq %r15, %rbx
+; FALLBACK6-NEXT:    movq -16(%rsp,%rsi), %r15
+; FALLBACK6-NEXT:    shlxq %rcx, %r15, %r12
 ; FALLBACK6-NEXT:    shrq %rdi
-; FALLBACK6-NEXT:    shrxq %r9, %rdi, %rdi
+; FALLBACK6-NEXT:    shrxq %rax, %rdi, %rdi
 ; FALLBACK6-NEXT:    orq %r12, %rdi
-; FALLBACK6-NEXT:    movq (%rsp,%rsi), %rbp
-; FALLBACK6-NEXT:    shlxq %rcx, %rbp, %r8
-; FALLBACK6-NEXT:    shrq %r13
-; FALLBACK6-NEXT:    shrxq %r9, %r13, %r12
-; FALLBACK6-NEXT:    orq %r15, %r12
-; FALLBACK6-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
-; FALLBACK6-NEXT:    movq -48(%rsp,%rsi), %rsi
-; FALLBACK6-NEXT:    shlxq %rcx, %rsi, %rcx
-; FALLBACK6-NEXT:    shrq %r11
-; FALLBACK6-NEXT:    shrxq %r9, %r11, %r11
-; FALLBACK6-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; FALLBACK6-NEXT:    shrq %r14
-; FALLBACK6-NEXT:    shrxq %r9, %r14, %r14
-; FALLBACK6-NEXT:    orq %r10, %r14
-; FALLBACK6-NEXT:    shrq %rsi
-; FALLBACK6-NEXT:    shrxq %r9, %rsi, %rsi
-; FALLBACK6-NEXT:    orq %rbx, %rsi
-; FALLBACK6-NEXT:    shrq %rax
-; FALLBACK6-NEXT:    shrxq %r9, %rax, %rax
-; FALLBACK6-NEXT:    orq %r8, %rax
-; FALLBACK6-NEXT:    shrq %rbp
-; FALLBACK6-NEXT:    shrxq %r9, %rbp, %r8
-; FALLBACK6-NEXT:    orq %r15, %r8
-; FALLBACK6-NEXT:    movq %rcx, (%rdx)
-; FALLBACK6-NEXT:    movq %r8, 56(%rdx)
-; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK6-NEXT:    movq %rsi, 8(%rdx)
-; FALLBACK6-NEXT:    movq %r14, 16(%rdx)
-; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK6-NEXT:    movq %r12, 32(%rdx)
-; FALLBACK6-NEXT:    movq %rdi, 40(%rdx)
-; FALLBACK6-NEXT:    addq $24, %rsp
+; FALLBACK6-NEXT:    shlxq %rcx, -8(%rsp,%rsi), %rcx
+; FALLBACK6-NEXT:    shrq %r15
+; FALLBACK6-NEXT:    shrxq %rax, %r15, %rax
+; FALLBACK6-NEXT:    orq %rcx, %rax
+; FALLBACK6-NEXT:    movq %r14, (%rdx)
+; FALLBACK6-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, 48(%rdx)
+; FALLBACK6-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK6-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r10, 24(%rdx)
+; FALLBACK6-NEXT:    movq %r9, 32(%rdx)
+; FALLBACK6-NEXT:    movq %r8, 40(%rdx)
+; FALLBACK6-NEXT:    addq $8, %rsp
 ; FALLBACK6-NEXT:    popq %rbx
 ; FALLBACK6-NEXT:    popq %r12
-; FALLBACK6-NEXT:    popq %r13
 ; FALLBACK6-NEXT:    popq %r14
 ; FALLBACK6-NEXT:    popq %r15
-; FALLBACK6-NEXT:    popq %rbp
 ; FALLBACK6-NEXT:    retq
 ;
 ; FALLBACK7-LABEL: shl_64bytes:
@@ -16798,80 +16762,75 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK10-LABEL: shl_64bytes:
 ; FALLBACK10:       # %bb.0:
-; FALLBACK10-NEXT:    pushq %rbp
 ; FALLBACK10-NEXT:    pushq %r15
 ; FALLBACK10-NEXT:    pushq %r14
-; FALLBACK10-NEXT:    pushq %r13
 ; FALLBACK10-NEXT:    pushq %r12
 ; FALLBACK10-NEXT:    pushq %rbx
-; FALLBACK10-NEXT:    subq $24, %rsp
+; FALLBACK10-NEXT:    pushq %rax
 ; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
 ; FALLBACK10-NEXT:    vmovups 32(%rdi), %ymm1
-; FALLBACK10-NEXT:    movl (%rsi), %eax
+; FALLBACK10-NEXT:    movl (%rsi), %esi
 ; FALLBACK10-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK10-NEXT:    andl $56, %ecx
+; FALLBACK10-NEXT:    leal (,%rsi,8), %eax
 ; FALLBACK10-NEXT:    andl $56, %eax
-; FALLBACK10-NEXT:    negl %eax
-; FALLBACK10-NEXT:    movslq %eax, %rsi
-; FALLBACK10-NEXT:    movq -8(%rsp,%rsi), %rax
-; FALLBACK10-NEXT:    shlxq %rcx, %rax, %r12
-; FALLBACK10-NEXT:    movq -16(%rsp,%rsi), %rdi
-; FALLBACK10-NEXT:    shlxq %rcx, %rdi, %r15
-; FALLBACK10-NEXT:    movq -24(%rsp,%rsi), %r13
-; FALLBACK10-NEXT:    shlxq %rcx, %r13, %r8
-; FALLBACK10-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; FALLBACK10-NEXT:    movq -32(%rsp,%rsi), %r11
-; FALLBACK10-NEXT:    shlxq %rcx, %r11, %r10
-; FALLBACK10-NEXT:    movq -40(%rsp,%rsi), %r14
-; FALLBACK10-NEXT:    shlxq %rcx, %r14, %rbx
-; FALLBACK10-NEXT:    movl %ecx, %r9d
-; FALLBACK10-NEXT:    notb %r9b
+; FALLBACK10-NEXT:    movl %eax, %ecx
+; FALLBACK10-NEXT:    andl $56, %esi
+; FALLBACK10-NEXT:    negl %esi
+; FALLBACK10-NEXT:    movslq %esi, %rsi
+; FALLBACK10-NEXT:    movq -24(%rsp,%rsi), %rdi
+; FALLBACK10-NEXT:    shlxq %rcx, %rdi, %r9
+; FALLBACK10-NEXT:    notb %al
+; FALLBACK10-NEXT:    movq -32(%rsp,%rsi), %r8
+; FALLBACK10-NEXT:    shlxq %rcx, %r8, %r10
+; FALLBACK10-NEXT:    shrq %r8
+; FALLBACK10-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK10-NEXT:    orq %r9, %r8
+; FALLBACK10-NEXT:    movq -40(%rsp,%rsi), %r9
+; FALLBACK10-NEXT:    shlxq %rcx, %r9, %r11
+; FALLBACK10-NEXT:    shrq %r9
+; FALLBACK10-NEXT:    shrxq %rax, %r9, %r9
+; FALLBACK10-NEXT:    orq %r10, %r9
+; FALLBACK10-NEXT:    movq -48(%rsp,%rsi), %r10
+; FALLBACK10-NEXT:    shlxq %rcx, %r10, %r14
+; FALLBACK10-NEXT:    shrq %r10
+; FALLBACK10-NEXT:    shrxq %rax, %r10, %r10
+; FALLBACK10-NEXT:    orq %r11, %r10
+; FALLBACK10-NEXT:    movq -64(%rsp,%rsi), %rbx
+; FALLBACK10-NEXT:    movq -56(%rsp,%rsi), %r11
+; FALLBACK10-NEXT:    shlxq %rcx, %r11, %r15
+; FALLBACK10-NEXT:    shrq %r11
+; FALLBACK10-NEXT:    shrxq %rax, %r11, %r11
+; FALLBACK10-NEXT:    orq %r14, %r11
+; FALLBACK10-NEXT:    shlxq %rcx, %rbx, %r14
+; FALLBACK10-NEXT:    shrq %rbx
+; FALLBACK10-NEXT:    shrxq %rax, %rbx, %rbx
+; FALLBACK10-NEXT:    orq %r15, %rbx
+; FALLBACK10-NEXT:    movq -16(%rsp,%rsi), %r15
+; FALLBACK10-NEXT:    shlxq %rcx, %r15, %r12
 ; FALLBACK10-NEXT:    shrq %rdi
-; FALLBACK10-NEXT:    shrxq %r9, %rdi, %rdi
+; FALLBACK10-NEXT:    shrxq %rax, %rdi, %rdi
 ; FALLBACK10-NEXT:    orq %r12, %rdi
-; FALLBACK10-NEXT:    movq (%rsp,%rsi), %rbp
-; FALLBACK10-NEXT:    shlxq %rcx, %rbp, %r8
-; FALLBACK10-NEXT:    shrq %r13
-; FALLBACK10-NEXT:    shrxq %r9, %r13, %r12
-; FALLBACK10-NEXT:    orq %r15, %r12
-; FALLBACK10-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
-; FALLBACK10-NEXT:    movq -48(%rsp,%rsi), %rsi
-; FALLBACK10-NEXT:    shlxq %rcx, %rsi, %rcx
-; FALLBACK10-NEXT:    shrq %r11
-; FALLBACK10-NEXT:    shrxq %r9, %r11, %r11
-; FALLBACK10-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; FALLBACK10-NEXT:    shrq %r14
-; FALLBACK10-NEXT:    shrxq %r9, %r14, %r14
-; FALLBACK10-NEXT:    orq %r10, %r14
-; FALLBACK10-NEXT:    shrq %rsi
-; FALLBACK10-NEXT:    shrxq %r9, %rsi, %rsi
-; FALLBACK10-NEXT:    orq %rbx, %rsi
-; FALLBACK10-NEXT:    shrq %rax
-; FALLBACK10-NEXT:    shrxq %r9, %rax, %rax
-; FALLBACK10-NEXT:    orq %r8, %rax
-; FALLBACK10-NEXT:    shrq %rbp
-; FALLBACK10-NEXT:    shrxq %r9, %rbp, %r8
-; FALLBACK10-NEXT:    orq %r15, %r8
-; FALLBACK10-NEXT:    movq %rcx, (%rdx)
-; FALLBACK10-NEXT:    movq %r8, 56(%rdx)
-; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK10-NEXT:    movq %rsi, 8(%rdx)
-; FALLBACK10-NEXT:    movq %r14, 16(%rdx)
-; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK10-NEXT:    movq %r12, 32(%rdx)
-; FALLBACK10-NEXT:    movq %rdi, 40(%rdx)
-; FALLBACK10-NEXT:    addq $24, %rsp
+; FALLBACK10-NEXT:    shlxq %rcx, -8(%rsp,%rsi), %rcx
+; FALLBACK10-NEXT:    shrq %r15
+; FALLBACK10-NEXT:    shrxq %rax, %r15, %rax
+; FALLBACK10-NEXT:    orq %rcx, %rax
+; FALLBACK10-NEXT:    movq %r14, (%rdx)
+; FALLBACK10-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, 48(%rdx)
+; FALLBACK10-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK10-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r10, 24(%rdx)
+; FALLBACK10-NEXT:    movq %r9, 32(%rdx)
+; FALLBACK10-NEXT:    movq %r8, 40(%rdx)
+; FALLBACK10-NEXT:    addq $8, %rsp
 ; FALLBACK10-NEXT:    popq %rbx
 ; FALLBACK10-NEXT:    popq %r12
-; FALLBACK10-NEXT:    popq %r13
 ; FALLBACK10-NEXT:    popq %r14
 ; FALLBACK10-NEXT:    popq %r15
-; FALLBACK10-NEXT:    popq %rbp
 ; FALLBACK10-NEXT:    vzeroupper
 ; FALLBACK10-NEXT:    retq
 ;
@@ -17071,77 +17030,72 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK14-LABEL: shl_64bytes:
 ; FALLBACK14:       # %bb.0:
-; FALLBACK14-NEXT:    pushq %rbp
 ; FALLBACK14-NEXT:    pushq %r15
 ; FALLBACK14-NEXT:    pushq %r14
-; FALLBACK14-NEXT:    pushq %r13
 ; FALLBACK14-NEXT:    pushq %r12
 ; FALLBACK14-NEXT:    pushq %rbx
-; FALLBACK14-NEXT:    subq $24, %rsp
+; FALLBACK14-NEXT:    pushq %rax
 ; FALLBACK14-NEXT:    vmovups (%rdi), %zmm0
-; FALLBACK14-NEXT:    movl (%rsi), %eax
+; FALLBACK14-NEXT:    movl (%rsi), %esi
 ; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK14-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK14-NEXT:    andl $56, %ecx
+; FALLBACK14-NEXT:    leal (,%rsi,8), %eax
 ; FALLBACK14-NEXT:    andl $56, %eax
-; FALLBACK14-NEXT:    negl %eax
-; FALLBACK14-NEXT:    movslq %eax, %rsi
-; FALLBACK14-NEXT:    movq -8(%rsp,%rsi), %rax
-; FALLBACK14-NEXT:    shlxq %rcx, %rax, %r12
-; FALLBACK14-NEXT:    movq -16(%rsp,%rsi), %rdi
-; FALLBACK14-NEXT:    shlxq %rcx, %rdi, %r15
-; FALLBACK14-NEXT:    movq -24(%rsp,%rsi), %r13
-; FALLBACK14-NEXT:    shlxq %rcx, %r13, %r8
-; FALLBACK14-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; FALLBACK14-NEXT:    movq -32(%rsp,%rsi), %r11
-; FALLBACK14-NEXT:    shlxq %rcx, %r11, %r10
-; FALLBACK14-NEXT:    movq -40(%rsp,%rsi), %r14
-; FALLBACK14-NEXT:    shlxq %rcx, %r14, %rbx
-; FALLBACK14-NEXT:    movl %ecx, %r9d
-; FALLBACK14-NEXT:    notb %r9b
+; FALLBACK14-NEXT:    movl %eax, %ecx
+; FALLBACK14-NEXT:    andl $56, %esi
+; FALLBACK14-NEXT:    negl %esi
+; FALLBACK14-NEXT:    movslq %esi, %rsi
+; FALLBACK14-NEXT:    movq -24(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT:    shlxq %rcx, %rdi, %r9
+; FALLBACK14-NEXT:    notb %al
+; FALLBACK14-NEXT:    movq -32(%rsp,%rsi), %r8
+; FALLBACK14-NEXT:    shlxq %rcx, %r8, %r10
+; FALLBACK14-NEXT:    shrq %r8
+; FALLBACK14-NEXT:    shrxq %rax, %r8, %r8
+; FALLBACK14-NEXT:    orq %r9, %r8
+; FALLBACK14-NEXT:    movq -40(%rsp,%rsi), %r9
+; FALLBACK14-NEXT:    shlxq %rcx, %r9, %r11
+; FALLBACK14-NEXT:    shrq %r9
+; FALLBACK14-NEXT:    shrxq %rax, %r9, %r9
+; FALLBACK14-NEXT:    orq %r10, %r9
+; FALLBACK14-NEXT:    movq -48(%rsp,%rsi), %r10
+; FALLBACK14-NEXT:    shlxq %rcx, %r10, %r14
+; FALLBACK14-NEXT:    shrq %r10
+; FALLBACK14-NEXT:    shrxq %rax, %r10, %r10
+; FALLBACK14-NEXT:    orq %r11, %r10
+; FALLBACK14-NEXT:    movq -64(%rsp,%rsi), %rbx
+; FALLBACK14-NEXT:    movq -56(%rsp,%rsi), %r11
+; FALLBACK14-NEXT:    shlxq %rcx, %r11, %r15
+; FALLBACK14-NEXT:    shrq %r11
+; FALLBACK14-NEXT:    shrxq %rax, %r11, %r11
+; FALLBACK14-NEXT:    orq %r14, %r11
+; FALLBACK14-NEXT:    shlxq %rcx, %rbx, %r14
+; FALLBACK14-NEXT:    shrq %rbx
+; FALLBACK14-NEXT:    shrxq %rax, %rbx, %rbx
+; FALLBACK14-NEXT:    orq %r15, %rbx
+; FALLBACK14-NEXT:    movq -16(%rsp,%rsi), %r15
+; FALLBACK14-NEXT:    shlxq %rcx, %r15, %r12
 ; FALLBACK14-NEXT:    shrq %rdi
-; FALLBACK14-NEXT:    shrxq %r9, %rdi, %rdi
+; FALLBACK14-NEXT:    shrxq %rax, %rdi, %rdi
 ; FALLBACK14-NEXT:    orq %r12, %rdi
-; FALLBACK14-NEXT:    movq (%rsp,%rsi), %rbp
-; FALLBACK14-NEXT:    shlxq %rcx, %rbp, %r8
-; FALLBACK14-NEXT:    shrq %r13
-; FALLBACK14-NEXT:    shrxq %r9, %r13, %r12
-; FALLBACK14-NEXT:    orq %r15, %r12
-; FALLBACK14-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
-; FALLBACK14-NEXT:    movq -48(%rsp,%rsi), %rsi
-; FALLBACK14-NEXT:    shlxq %rcx, %rsi, %rcx
-; FALLBACK14-NEXT:    shrq %r11
-; FALLBACK14-NEXT:    shrxq %r9, %r11, %r11
-; FALLBACK14-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; FALLBACK14-NEXT:    shrq %r14
-; FALLBACK14-NEXT:    shrxq %r9, %r14, %r14
-; FALLBACK14-NEXT:    orq %r10, %r14
-; FALLBACK14-NEXT:    shrq %rsi
-; FALLBACK14-NEXT:    shrxq %r9, %rsi, %rsi
-; FALLBACK14-NEXT:    orq %rbx, %rsi
-; FALLBACK14-NEXT:    shrq %rax
-; FALLBACK14-NEXT:    shrxq %r9, %rax, %rax
-; FALLBACK14-NEXT:    orq %r8, %rax
-; FALLBACK14-NEXT:    shrq %rbp
-; FALLBACK14-NEXT:    shrxq %r9, %rbp, %r8
-; FALLBACK14-NEXT:    orq %r15, %r8
-; FALLBACK14-NEXT:    movq %rcx, (%rdx)
-; FALLBACK14-NEXT:    movq %r8, 56(%rdx)
-; FALLBACK14-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK14-NEXT:    movq %rsi, 8(%rdx)
-; FALLBACK14-NEXT:    movq %r14, 16(%rdx)
-; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK14-NEXT:    movq %r12, 32(%rdx)
-; FALLBACK14-NEXT:    movq %rdi, 40(%rdx)
-; FALLBACK14-NEXT:    addq $24, %rsp
+; FALLBACK14-NEXT:    shlxq %rcx, -8(%rsp,%rsi), %rcx
+; FALLBACK14-NEXT:    shrq %r15
+; FALLBACK14-NEXT:    shrxq %rax, %r15, %rax
+; FALLBACK14-NEXT:    orq %rcx, %rax
+; FALLBACK14-NEXT:    movq %r14, (%rdx)
+; FALLBACK14-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, 48(%rdx)
+; FALLBACK14-NEXT:    movq %rbx, 8(%rdx)
+; FALLBACK14-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r10, 24(%rdx)
+; FALLBACK14-NEXT:    movq %r9, 32(%rdx)
+; FALLBACK14-NEXT:    movq %r8, 40(%rdx)
+; FALLBACK14-NEXT:    addq $8, %rsp
 ; FALLBACK14-NEXT:    popq %rbx
 ; FALLBACK14-NEXT:    popq %r12
-; FALLBACK14-NEXT:    popq %r13
 ; FALLBACK14-NEXT:    popq %r14
 ; FALLBACK14-NEXT:    popq %r15
-; FALLBACK14-NEXT:    popq %rbp
 ; FALLBACK14-NEXT:    vzeroupper
 ; FALLBACK14-NEXT:    retq
 ;
@@ -17681,144 +17635,149 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    leal (,%ebp,8), %edx
-; FALLBACK18-NEXT:    andl $24, %edx
+; FALLBACK18-NEXT:    leal (,%ebp,8), %ebx
+; FALLBACK18-NEXT:    andl $24, %ebx
+; FALLBACK18-NEXT:    movl %ebx, %eax
 ; FALLBACK18-NEXT:    andl $60, %ebp
 ; FALLBACK18-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; FALLBACK18-NEXT:    subl %ebp, %edi
-; FALLBACK18-NEXT:    movl (%edi), %ecx
-; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 4(%edi), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl %edx, %ebx
+; FALLBACK18-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; FALLBACK18-NEXT:    subl %ebp, %edx
+; FALLBACK18-NEXT:    movl (%edx), %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 4(%edx), %ecx
 ; FALLBACK18-NEXT:    notb %bl
-; FALLBACK18-NEXT:    shrl %ecx
-; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %esi
-; FALLBACK18-NEXT:    shlxl %edx, %eax, %ecx
-; FALLBACK18-NEXT:    orl %ecx, %esi
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK18-NEXT:    shlxl %eax, %ecx, %esi
+; FALLBACK18-NEXT:    movl %eax, %ebp
+; FALLBACK18-NEXT:    orl %esi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 8(%edx), %esi
 ; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 8(%edi), %esi
-; FALLBACK18-NEXT:    movl %esi, %ecx
-; FALLBACK18-NEXT:    shrl %ecx
-; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK18-NEXT:    movl 12(%edi), %ecx
-; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT:    shrl %eax
-; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT:    orl %esi, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 16(%edi), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrl %eax
-; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT:    movl 20(%edi), %esi
-; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    movl 12(%edx), %esi
+; FALLBACK18-NEXT:    movl %ebp, %edi
+; FALLBACK18-NEXT:    shlxl %ebp, %esi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    shrl %ecx
 ; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
 ; FALLBACK18-NEXT:    orl %eax, %ecx
 ; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 24(%edi), %ecx
+; FALLBACK18-NEXT:    movl 16(%edx), %ecx
 ; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    shrl %ecx
 ; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK18-NEXT:    movl 28(%edi), %ecx
-; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT:    movl 20(%edx), %ecx
+; FALLBACK18-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    shrl %esi
-; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK18-NEXT:    orl %eax, %esi
-; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 32(%edi), %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrl %eax
-; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT:    movl 36(%edi), %esi
-; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT:    movl 24(%edx), %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrl %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    movl 28(%edx), %esi
+; FALLBACK18-NEXT:    shlxl %edi, %esi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    shrl %ecx
-; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT:    orl %eax, %ecx
-; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 40(%edi), %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 32(%edx), %ecx
 ; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    shrl %ecx
 ; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK18-NEXT:    movl 44(%edi), %ecx
-; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT:    movl 36(%edx), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl %edi, %eax
 ; FALLBACK18-NEXT:    shrl %esi
 ; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK18-NEXT:    orl %eax, %esi
-; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 48(%edi), %esi
+; FALLBACK18-NEXT:    orl %ebp, %esi
 ; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 40(%edx), %edi
+; FALLBACK18-NEXT:    movl %edi, %esi
 ; FALLBACK18-NEXT:    shrl %esi
-; FALLBACK18-NEXT:    shrxl %ebx, %esi, %eax
-; FALLBACK18-NEXT:    movl 52(%edi), %esi
-; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %ecx
+; FALLBACK18-NEXT:    movl 44(%edx), %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %eax, %esi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK18-NEXT:    movl %eax, %esi
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT:    shrl %ecx
-; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ebp
-; FALLBACK18-NEXT:    orl %eax, %ebp
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl 48(%edx), %ebp
+; FALLBACK18-NEXT:    movl %ebp, %edi
+; FALLBACK18-NEXT:    shrl %edi
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    movl 52(%edx), %ecx
+; FALLBACK18-NEXT:    shlxl %esi, %ecx, %edi
+; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %esi, %ebp, %edi
+; FALLBACK18-NEXT:    movl %esi, %ebp
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT:    negl %eax
-; FALLBACK18-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK18-NEXT:    movl 56(%edi), %eax
-; FALLBACK18-NEXT:    shlxl %edx, %eax, %edx
-; FALLBACK18-NEXT:    shrl %esi
-; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK18-NEXT:    orl %edx, %esi
 ; FALLBACK18-NEXT:    shrl %eax
-; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT:    orl %eax, %ecx
-; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %edx, (%eax)
-; FALLBACK18-NEXT:    movl %esi, 56(%eax)
-; FALLBACK18-NEXT:    movl %ecx, 60(%eax)
-; FALLBACK18-NEXT:    movl %ebp, 48(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %esi
+; FALLBACK18-NEXT:    orl %edi, %esi
+; FALLBACK18-NEXT:    movl 56(%edx), %edi
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT:    shlxl %ebp, %edi, %ecx
+; FALLBACK18-NEXT:    orl %ecx, %eax
+; FALLBACK18-NEXT:    shrl %edi
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ecx
+; FALLBACK18-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK18-NEXT:    negl %ebx
+; FALLBACK18-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
+; FALLBACK18-NEXT:    orl %ecx, %ebx
+; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK18-NEXT:    movl %edi, (%edx)
+; FALLBACK18-NEXT:    movl %eax, 56(%edx)
+; FALLBACK18-NEXT:    movl %ebx, 60(%edx)
+; FALLBACK18-NEXT:    movl %esi, 48(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 52(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 40(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 44(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 32(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 36(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 24(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 28(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 16(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 20(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 8(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 12(%edx)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl %eax, 4(%edx)
 ; FALLBACK18-NEXT:    addl $204, %esp
 ; FALLBACK18-NEXT:    popl %esi
 ; FALLBACK18-NEXT:    popl %edi
@@ -18342,144 +18301,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    leal (,%eax,8), %edx
-; FALLBACK22-NEXT:    andl $24, %edx
+; FALLBACK22-NEXT:    leal (,%eax,8), %ebx
+; FALLBACK22-NEXT:    andl $24, %ebx
+; FALLBACK22-NEXT:    movl %ebx, %ecx
 ; FALLBACK22-NEXT:    andl $60, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; FALLBACK22-NEXT:    subl %eax, %edi
-; FALLBACK22-NEXT:    movl (%edi), %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 4(%edi), %eax
+; FALLBACK22-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT:    subl %eax, %edx
+; FALLBACK22-NEXT:    movl (%edx), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 4(%edx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl %edx, %ebx
 ; FALLBACK22-NEXT:    notb %bl
-; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %esi
-; FALLBACK22-NEXT:    shlxl %edx, %eax, %ecx
-; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK22-NEXT:    shlxl %ecx, %eax, %esi
+; FALLBACK22-NEXT:    orl %esi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 8(%edx), %esi
 ; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 8(%edi), %esi
-; FALLBACK22-NEXT:    movl %esi, %ecx
-; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT:    movl 12(%edi), %ecx
-; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT:    shrl %eax
-; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT:    orl %esi, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 16(%edi), %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrl %eax
-; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT:    movl 20(%edi), %esi
-; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    movl 12(%edx), %esi
+; FALLBACK22-NEXT:    shlxl %ecx, %esi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %ecx, %edi
+; FALLBACK22-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK22-NEXT:    shrl %ecx
 ; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
 ; FALLBACK22-NEXT:    orl %eax, %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 24(%edi), %ecx
+; FALLBACK22-NEXT:    movl 16(%edx), %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    shrl %ecx
 ; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT:    movl 28(%edi), %ecx
-; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT:    movl 20(%edx), %ecx
+; FALLBACK22-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    shrl %esi
-; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %eax, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 32(%edi), %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrl %eax
-; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT:    movl 36(%edi), %esi
-; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    movl 24(%edx), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    movl 28(%edx), %esi
+; FALLBACK22-NEXT:    shlxl %edi, %esi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT:    orl %eax, %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 40(%edi), %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 32(%edx), %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    shrl %ecx
 ; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT:    movl 44(%edi), %ecx
-; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT:    movl 36(%edx), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %edi, %eax
 ; FALLBACK22-NEXT:    shrl %esi
 ; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %eax, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 48(%edi), %esi
+; FALLBACK22-NEXT:    orl %ebp, %esi
 ; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 40(%edx), %edi
+; FALLBACK22-NEXT:    movl %edi, %esi
 ; FALLBACK22-NEXT:    shrl %esi
-; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
-; FALLBACK22-NEXT:    movl 52(%edi), %esi
-; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %ecx
+; FALLBACK22-NEXT:    movl 44(%edx), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %eax, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK22-NEXT:    movl %eax, %esi
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    orl %edi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ebp
-; FALLBACK22-NEXT:    orl %eax, %ebp
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl 48(%edx), %ebp
+; FALLBACK22-NEXT:    movl %ebp, %edi
+; FALLBACK22-NEXT:    shrl %edi
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 52(%edx), %ecx
+; FALLBACK22-NEXT:    shlxl %esi, %ecx, %edi
+; FALLBACK22-NEXT:    orl %edi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %esi, %ebp, %edi
+; FALLBACK22-NEXT:    movl %esi, %ebp
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT:    negl %eax
-; FALLBACK22-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK22-NEXT:    movl 56(%edi), %eax
-; FALLBACK22-NEXT:    shlxl %edx, %eax, %edx
-; FALLBACK22-NEXT:    shrl %esi
-; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %edx, %esi
 ; FALLBACK22-NEXT:    shrl %eax
-; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT:    orl %eax, %ecx
-; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %edx, (%eax)
-; FALLBACK22-NEXT:    movl %esi, 56(%eax)
-; FALLBACK22-NEXT:    movl %ecx, 60(%eax)
-; FALLBACK22-NEXT:    movl %ebp, 48(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %esi
+; FALLBACK22-NEXT:    orl %edi, %esi
+; FALLBACK22-NEXT:    movl 56(%edx), %edi
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    shlxl %ebp, %edi, %ecx
+; FALLBACK22-NEXT:    orl %ecx, %eax
+; FALLBACK22-NEXT:    shrl %edi
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ecx
+; FALLBACK22-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK22-NEXT:    negl %ebx
+; FALLBACK22-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
+; FALLBACK22-NEXT:    orl %ecx, %ebx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT:    movl %edi, (%edx)
+; FALLBACK22-NEXT:    movl %eax, 56(%edx)
+; FALLBACK22-NEXT:    movl %ebx, 60(%edx)
+; FALLBACK22-NEXT:    movl %esi, 48(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 52(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 40(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 44(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 32(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 36(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 24(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 28(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 16(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 20(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 8(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 12(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 4(%edx)
 ; FALLBACK22-NEXT:    addl $204, %esp
 ; FALLBACK22-NEXT:    popl %esi
 ; FALLBACK22-NEXT:    popl %edi
@@ -18943,144 +18908,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    leal (,%eax,8), %edx
-; FALLBACK26-NEXT:    andl $24, %edx
+; FALLBACK26-NEXT:    leal (,%eax,8), %ebx
+; FALLBACK26-NEXT:    andl $24, %ebx
+; FALLBACK26-NEXT:    movl %ebx, %ecx
 ; FALLBACK26-NEXT:    andl $60, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; FALLBACK26-NEXT:    subl %eax, %edi
-; FALLBACK26-NEXT:    movl (%edi), %ecx
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 4(%edi), %eax
+; FALLBACK26-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT:    subl %eax, %edx
+; FALLBACK26-NEXT:    movl (%edx), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 4(%edx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl %edx, %ebx
 ; FALLBACK26-NEXT:    notb %bl
-; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %esi
-; FALLBACK26-NEXT:    shlxl %edx, %eax, %ecx
-; FALLBACK26-NEXT:    orl %ecx, %esi
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK26-NEXT:    shlxl %ecx, %eax, %esi
+; FALLBACK26-NEXT:    orl %esi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 8(%edx), %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 8(%edi), %esi
-; FALLBACK26-NEXT:    movl %esi, %ecx
-; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK26-NEXT:    movl 12(%edi), %ecx
-; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT:    shrl %eax
-; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT:    orl %esi, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 16(%edi), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrl %eax
-; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT:    movl 20(%edi), %esi
-; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    movl 12(%edx), %esi
+; FALLBACK26-NEXT:    shlxl %ecx, %esi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %ecx, %edi
+; FALLBACK26-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK26-NEXT:    shrl %ecx
 ; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
 ; FALLBACK26-NEXT:    orl %eax, %ecx
 ; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 24(%edi), %ecx
+; FALLBACK26-NEXT:    movl 16(%edx), %ecx
 ; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    shrl %ecx
 ; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK26-NEXT:    movl 28(%edi), %ecx
-; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT:    movl 20(%edx), %ecx
+; FALLBACK26-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    shrl %esi
-; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %eax, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 32(%edi), %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrl %eax
-; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT:    movl 36(%edi), %esi
-; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    movl 24(%edx), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    movl 28(%edx), %esi
+; FALLBACK26-NEXT:    shlxl %edi, %esi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT:    orl %eax, %ecx
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 40(%edi), %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 32(%edx), %ecx
 ; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    shrl %ecx
 ; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK26-NEXT:    movl 44(%edi), %ecx
-; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT:    movl 36(%edx), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %edi, %eax
 ; FALLBACK26-NEXT:    shrl %esi
 ; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %eax, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 48(%edi), %esi
+; FALLBACK26-NEXT:    orl %ebp, %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 40(%edx), %edi
+; FALLBACK26-NEXT:    movl %edi, %esi
 ; FALLBACK26-NEXT:    shrl %esi
-; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
-; FALLBACK26-NEXT:    movl 52(%edi), %esi
-; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %ecx
+; FALLBACK26-NEXT:    movl 44(%edx), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %eax, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK26-NEXT:    movl %eax, %esi
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    orl %edi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ebp
-; FALLBACK26-NEXT:    orl %eax, %ebp
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl 48(%edx), %ebp
+; FALLBACK26-NEXT:    movl %ebp, %edi
+; FALLBACK26-NEXT:    shrl %edi
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 52(%edx), %ecx
+; FALLBACK26-NEXT:    shlxl %esi, %ecx, %edi
+; FALLBACK26-NEXT:    orl %edi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %esi, %ebp, %edi
+; FALLBACK26-NEXT:    movl %esi, %ebp
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT:    negl %eax
-; FALLBACK26-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK26-NEXT:    movl 56(%edi), %eax
-; FALLBACK26-NEXT:    shlxl %edx, %eax, %edx
-; FALLBACK26-NEXT:    shrl %esi
-; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %edx, %esi
 ; FALLBACK26-NEXT:    shrl %eax
-; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT:    orl %eax, %ecx
-; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %edx, (%eax)
-; FALLBACK26-NEXT:    movl %esi, 56(%eax)
-; FALLBACK26-NEXT:    movl %ecx, 60(%eax)
-; FALLBACK26-NEXT:    movl %ebp, 48(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %esi
+; FALLBACK26-NEXT:    orl %edi, %esi
+; FALLBACK26-NEXT:    movl 56(%edx), %edi
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    shlxl %ebp, %edi, %ecx
+; FALLBACK26-NEXT:    orl %ecx, %eax
+; FALLBACK26-NEXT:    shrl %edi
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ecx
+; FALLBACK26-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK26-NEXT:    negl %ebx
+; FALLBACK26-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
+; FALLBACK26-NEXT:    orl %ecx, %ebx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT:    movl %edi, (%edx)
+; FALLBACK26-NEXT:    movl %eax, 56(%edx)
+; FALLBACK26-NEXT:    movl %ebx, 60(%edx)
+; FALLBACK26-NEXT:    movl %esi, 48(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 52(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 40(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 44(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 32(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 36(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 24(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 28(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 16(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 20(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 8(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 12(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 4(%edx)
 ; FALLBACK26-NEXT:    addl $204, %esp
 ; FALLBACK26-NEXT:    popl %esi
 ; FALLBACK26-NEXT:    popl %edi
@@ -19531,144 +19502,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK30-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    leal (,%eax,8), %edx
-; FALLBACK30-NEXT:    andl $24, %edx
+; FALLBACK30-NEXT:    leal (,%eax,8), %ebx
+; FALLBACK30-NEXT:    andl $24, %ebx
+; FALLBACK30-NEXT:    movl %ebx, %ecx
 ; FALLBACK30-NEXT:    andl $60, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; FALLBACK30-NEXT:    subl %eax, %edi
-; FALLBACK30-NEXT:    movl (%edi), %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 4(%edi), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl %edx, %ebx
-; FALLBACK30-NEXT:    notb %bl
-; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %esi
-; FALLBACK30-NEXT:    shlxl %edx, %eax, %ecx
-; FALLBACK30-NEXT:    orl %ecx, %esi
+; FALLBACK30-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT:    subl %eax, %edx
+; FALLBACK30-NEXT:    movl (%edx), %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 8(%edi), %esi
-; FALLBACK30-NEXT:    movl %esi, %ecx
-; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK30-NEXT:    movl 12(%edi), %ecx
-; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT:    shrl %eax
-; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT:    orl %esi, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 16(%edi), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrl %eax
-; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT:    movl 20(%edi), %esi
-; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT:    orl %eax, %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 24(%edi), %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK30-NEXT:    movl 28(%edi), %ecx
-; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl 4(%edx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    notb %bl
 ; FALLBACK30-NEXT:    shrl %esi
-; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %eax, %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK30-NEXT:    shlxl %ecx, %eax, %esi
+; FALLBACK30-NEXT:    orl %esi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 8(%edx), %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 32(%edi), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrl %eax
-; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT:    movl 36(%edi), %esi
-; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    movl 12(%edx), %esi
+; FALLBACK30-NEXT:    shlxl %ecx, %esi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %ecx, %edi
+; FALLBACK30-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK30-NEXT:    shrl %ecx
 ; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
 ; FALLBACK30-NEXT:    orl %eax, %ecx
 ; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 40(%edi), %ecx
+; FALLBACK30-NEXT:    movl 16(%edx), %ecx
 ; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    shrl %ecx
 ; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK30-NEXT:    movl 44(%edi), %ecx
-; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT:    movl 20(%edx), %ecx
+; FALLBACK30-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    shrl %esi
-; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %eax, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 48(%edi), %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 24(%edx), %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    shrl %esi
 ; FALLBACK30-NEXT:    shrxl %ebx, %esi, %eax
-; FALLBACK30-NEXT:    movl 52(%edi), %esi
-; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    movl 28(%edx), %esi
+; FALLBACK30-NEXT:    shlxl %edi, %esi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ebp
-; FALLBACK30-NEXT:    orl %eax, %ebp
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT:    negl %eax
-; FALLBACK30-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK30-NEXT:    movl 56(%edi), %eax
-; FALLBACK30-NEXT:    shlxl %edx, %eax, %edx
+; FALLBACK30-NEXT:    movl 32(%edx), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    movl 36(%edx), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edi, %ecx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %edi, %eax
 ; FALLBACK30-NEXT:    shrl %esi
 ; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %edx, %esi
+; FALLBACK30-NEXT:    orl %ebp, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 40(%edx), %edi
+; FALLBACK30-NEXT:    movl %edi, %esi
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %ecx
+; FALLBACK30-NEXT:    movl 44(%edx), %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %eax, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK30-NEXT:    movl %eax, %esi
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK30-NEXT:    shrl %eax
 ; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT:    orl %eax, %ecx
-; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %edx, (%eax)
-; FALLBACK30-NEXT:    movl %esi, 56(%eax)
-; FALLBACK30-NEXT:    movl %ecx, 60(%eax)
-; FALLBACK30-NEXT:    movl %ebp, 48(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK30-NEXT:    orl %edi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 48(%edx), %ebp
+; FALLBACK30-NEXT:    movl %ebp, %edi
+; FALLBACK30-NEXT:    shrl %edi
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 52(%edx), %ecx
+; FALLBACK30-NEXT:    shlxl %esi, %ecx, %edi
+; FALLBACK30-NEXT:    orl %edi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %esi, %ebp, %edi
+; FALLBACK30-NEXT:    movl %esi, %ebp
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %esi
+; FALLBACK30-NEXT:    orl %edi, %esi
+; FALLBACK30-NEXT:    movl 56(%edx), %edi
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    shlxl %ebp, %edi, %ecx
+; FALLBACK30-NEXT:    orl %ecx, %eax
+; FALLBACK30-NEXT:    shrl %edi
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ecx
+; FALLBACK30-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK30-NEXT:    negl %ebx
+; FALLBACK30-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
+; FALLBACK30-NEXT:    orl %ecx, %ebx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT:    movl %edi, (%edx)
+; FALLBACK30-NEXT:    movl %eax, 56(%edx)
+; FALLBACK30-NEXT:    movl %ebx, 60(%edx)
+; FALLBACK30-NEXT:    movl %esi, 48(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 52(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 40(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 44(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 32(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 36(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 24(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 28(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 16(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 20(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 8(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 12(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 4(%edx)
 ; FALLBACK30-NEXT:    addl $204, %esp
 ; FALLBACK30-NEXT:    popl %esi
 ; FALLBACK30-NEXT:    popl %edi
@@ -20336,10 +20313,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK2-LABEL: ashr_64bytes:
 ; FALLBACK2:       # %bb.0:
-; FALLBACK2-NEXT:    pushq %rbp
 ; FALLBACK2-NEXT:    pushq %r15
 ; FALLBACK2-NEXT:    pushq %r14
-; FALLBACK2-NEXT:    pushq %r13
 ; FALLBACK2-NEXT:    pushq %r12
 ; FALLBACK2-NEXT:    pushq %rbx
 ; FALLBACK2-NEXT:    pushq %rax
@@ -20371,60 +20346,58 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    leal (,%rax,8), %ecx
 ; FALLBACK2-NEXT:    andl $56, %ecx
+; FALLBACK2-NEXT:    movl %ecx, %esi
 ; FALLBACK2-NEXT:    andl $56, %eax
-; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %rdi
-; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r9
-; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %rbx
-; FALLBACK2-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
-; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %rsi
-; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %r8
-; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r10
-; FALLBACK2-NEXT:    shrxq %rcx, %r9, %r11
-; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r14
-; FALLBACK2-NEXT:    shrxq %rcx, %r14, %r15
-; FALLBACK2-NEXT:    shrxq %rcx, %r10, %rbp
-; FALLBACK2-NEXT:    movl %ecx, %r12d
-; FALLBACK2-NEXT:    notb %r12b
-; FALLBACK2-NEXT:    addq %r9, %r9
-; FALLBACK2-NEXT:    shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %r8
+; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    shrxq %rsi, %r8, %r9
+; FALLBACK2-NEXT:    notb %cl
+; FALLBACK2-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK2-NEXT:    shlxq %rcx, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r9, %rdi
+; FALLBACK2-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r9
+; FALLBACK2-NEXT:    addq %r8, %r8
+; FALLBACK2-NEXT:    shlxq %rcx, %r8, %r8
+; FALLBACK2-NEXT:    orq %r9, %r8
+; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK2-NEXT:    shrxq %rsi, %r11, %rbx
+; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r14
+; FALLBACK2-NEXT:    leaq (%r14,%r14), %r9
+; FALLBACK2-NEXT:    shlxq %rcx, %r9, %r9
 ; FALLBACK2-NEXT:    orq %rbx, %r9
-; FALLBACK2-NEXT:    addq %rdi, %rdi
-; FALLBACK2-NEXT:    shlxq %r12, %rdi, %rdi
-; FALLBACK2-NEXT:    orq %r13, %rdi
-; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %rbx
-; FALLBACK2-NEXT:    shrxq %rcx, %rbx, %r13
-; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
-; FALLBACK2-NEXT:    sarxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT:    shrxq %rsi, %r10, %r10
+; FALLBACK2-NEXT:    addq %r11, %r11
+; FALLBACK2-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK2-NEXT:    orq %r10, %r11
+; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    shrxq %rsi, %r10, %rbx
+; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %r15
+; FALLBACK2-NEXT:    leaq (%r15,%r15), %r12
+; FALLBACK2-NEXT:    shlxq %rcx, %r12, %r12
+; FALLBACK2-NEXT:    orq %rbx, %r12
+; FALLBACK2-NEXT:    shrxq %rsi, %r14, %rbx
 ; FALLBACK2-NEXT:    addq %r10, %r10
-; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
-; FALLBACK2-NEXT:    orq %r8, %r10
-; FALLBACK2-NEXT:    addq %rsi, %rsi
-; FALLBACK2-NEXT:    shlxq %r12, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r11, %rsi
-; FALLBACK2-NEXT:    leaq (%rbx,%rbx), %r8
-; FALLBACK2-NEXT:    shlxq %r12, %r8, %r8
-; FALLBACK2-NEXT:    orq %r15, %r8
-; FALLBACK2-NEXT:    addq %r14, %r14
-; FALLBACK2-NEXT:    shlxq %r12, %r14, %r11
-; FALLBACK2-NEXT:    orq %rbp, %r11
-; FALLBACK2-NEXT:    addq %rax, %rax
-; FALLBACK2-NEXT:    shlxq %r12, %rax, %rax
-; FALLBACK2-NEXT:    orq %r13, %rax
-; FALLBACK2-NEXT:    movq %rcx, 56(%rdx)
-; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
-; FALLBACK2-NEXT:    movq %r8, 40(%rdx)
-; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
-; FALLBACK2-NEXT:    movq %r10, 24(%rdx)
-; FALLBACK2-NEXT:    movq %rdi, (%rdx)
-; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK2-NEXT:    shlxq %rcx, %r10, %r10
+; FALLBACK2-NEXT:    orq %rbx, %r10
+; FALLBACK2-NEXT:    shrxq %rsi, %r15, %rbx
+; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT:    leaq (%rax,%rax), %r14
+; FALLBACK2-NEXT:    shlxq %rcx, %r14, %rcx
+; FALLBACK2-NEXT:    orq %rbx, %rcx
+; FALLBACK2-NEXT:    sarxq %rsi, %rax, %rax
+; FALLBACK2-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK2-NEXT:    movq %rcx, 48(%rdx)
+; FALLBACK2-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r12, 40(%rdx)
+; FALLBACK2-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK2-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r8, (%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
 ; FALLBACK2-NEXT:    addq $8, %rsp
 ; FALLBACK2-NEXT:    popq %rbx
 ; FALLBACK2-NEXT:    popq %r12
-; FALLBACK2-NEXT:    popq %r13
 ; FALLBACK2-NEXT:    popq %r14
 ; FALLBACK2-NEXT:    popq %r15
-; FALLBACK2-NEXT:    popq %rbp
 ; FALLBACK2-NEXT:    retq
 ;
 ; FALLBACK3-LABEL: ashr_64bytes:
@@ -20664,13 +20637,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK6-LABEL: ashr_64bytes:
 ; FALLBACK6:       # %bb.0:
-; FALLBACK6-NEXT:    pushq %rbp
 ; FALLBACK6-NEXT:    pushq %r15
 ; FALLBACK6-NEXT:    pushq %r14
 ; FALLBACK6-NEXT:    pushq %r13
 ; FALLBACK6-NEXT:    pushq %r12
 ; FALLBACK6-NEXT:    pushq %rbx
-; FALLBACK6-NEXT:    pushq %rax
 ; FALLBACK6-NEXT:    movups (%rdi), %xmm0
 ; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
 ; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
@@ -20691,62 +20662,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT:    leal (,%rax,8), %esi
-; FALLBACK6-NEXT:    andl $56, %esi
+; FALLBACK6-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK6-NEXT:    andl $56, %ecx
+; FALLBACK6-NEXT:    movl %ecx, %esi
 ; FALLBACK6-NEXT:    andl $56, %eax
-; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %rcx
-; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %rdi
-; FALLBACK6-NEXT:    shrxq %rsi, %rdi, %r12
-; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r13
-; FALLBACK6-NEXT:    shrxq %rsi, %rcx, %r9
-; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %r10
-; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r14
-; FALLBACK6-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK6-NEXT:    movl %esi, %ebx
-; FALLBACK6-NEXT:    notb %bl
-; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %rbp
-; FALLBACK6-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK6-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK6-NEXT:    orq %r11, %r8
-; FALLBACK6-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK6-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK6-NEXT:    orq %r12, %r11
+; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
+; FALLBACK6-NEXT:    notb %cl
+; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %r10
+; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK6-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK6-NEXT:    shlxq %rcx, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %r8, %rdi
+; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK6-NEXT:    shrxq %rsi, %r11, %rbx
+; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r14
+; FALLBACK6-NEXT:    leaq (%r14,%r14), %r8
+; FALLBACK6-NEXT:    shlxq %rcx, %r8, %r8
+; FALLBACK6-NEXT:    orq %rbx, %r8
+; FALLBACK6-NEXT:    shrxq %rsi, %r9, %rbx
+; FALLBACK6-NEXT:    addq %r11, %r11
+; FALLBACK6-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK6-NEXT:    orq %rbx, %r11
+; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %rbx
+; FALLBACK6-NEXT:    shrxq %rsi, %rbx, %r15
 ; FALLBACK6-NEXT:    movq -80(%rsp,%rax), %r12
-; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r13
-; FALLBACK6-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK6-NEXT:    leaq (%r12,%r12), %r13
+; FALLBACK6-NEXT:    shlxq %rcx, %r13, %r13
+; FALLBACK6-NEXT:    orq %r15, %r13
+; FALLBACK6-NEXT:    shrxq %rsi, %r14, %r14
+; FALLBACK6-NEXT:    addq %rbx, %rbx
+; FALLBACK6-NEXT:    shlxq %rcx, %rbx, %rbx
+; FALLBACK6-NEXT:    orq %r14, %rbx
+; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r14
 ; FALLBACK6-NEXT:    movq -72(%rsp,%rax), %rax
-; FALLBACK6-NEXT:    sarxq %rsi, %rax, %rsi
-; FALLBACK6-NEXT:    addq %rdi, %rdi
-; FALLBACK6-NEXT:    shlxq %rbx, %rdi, %rdi
-; FALLBACK6-NEXT:    orq %r9, %rdi
-; FALLBACK6-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK6-NEXT:    shlxq %rbx, %r9, %r9
-; FALLBACK6-NEXT:    orq %r14, %r9
-; FALLBACK6-NEXT:    addq %r10, %r10
-; FALLBACK6-NEXT:    shlxq %rbx, %r10, %r10
-; FALLBACK6-NEXT:    orq %r15, %r10
-; FALLBACK6-NEXT:    addq %rax, %rax
-; FALLBACK6-NEXT:    shlxq %rbx, %rax, %rax
-; FALLBACK6-NEXT:    orq %r13, %rax
-; FALLBACK6-NEXT:    addq %rcx, %rcx
-; FALLBACK6-NEXT:    shlxq %rbx, %rcx, %rcx
-; FALLBACK6-NEXT:    orq %rbp, %rcx
-; FALLBACK6-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK6-NEXT:    leaq (%rax,%rax), %r15
+; FALLBACK6-NEXT:    shlxq %rcx, %r15, %r15
+; FALLBACK6-NEXT:    orq %r14, %r15
+; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r10
+; FALLBACK6-NEXT:    addq %r9, %r9
+; FALLBACK6-NEXT:    shlxq %rcx, %r9, %rcx
+; FALLBACK6-NEXT:    orq %r10, %rcx
+; FALLBACK6-NEXT:    sarxq %rsi, %rax, %rax
+; FALLBACK6-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK6-NEXT:    movq %rcx, 8(%rdx)
-; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK6-NEXT:    movq %r10, 32(%rdx)
-; FALLBACK6-NEXT:    movq %r9, 40(%rdx)
-; FALLBACK6-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK6-NEXT:    movq %r8, (%rdx)
-; FALLBACK6-NEXT:    addq $8, %rsp
+; FALLBACK6-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK6-NEXT:    movq %rbx, 32(%rdx)
+; FALLBACK6-NEXT:    movq %r13, 40(%rdx)
+; FALLBACK6-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, (%rdx)
 ; FALLBACK6-NEXT:    popq %rbx
 ; FALLBACK6-NEXT:    popq %r12
 ; FALLBACK6-NEXT:    popq %r13
 ; FALLBACK6-NEXT:    popq %r14
 ; FALLBACK6-NEXT:    popq %r15
-; FALLBACK6-NEXT:    popq %rbp
 ; FALLBACK6-NEXT:    retq
 ;
 ; FALLBACK7-LABEL: ashr_64bytes:
@@ -20979,13 +20948,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK10-LABEL: ashr_64bytes:
 ; FALLBACK10:       # %bb.0:
-; FALLBACK10-NEXT:    pushq %rbp
 ; FALLBACK10-NEXT:    pushq %r15
 ; FALLBACK10-NEXT:    pushq %r14
 ; FALLBACK10-NEXT:    pushq %r13
 ; FALLBACK10-NEXT:    pushq %r12
 ; FALLBACK10-NEXT:    pushq %rbx
-; FALLBACK10-NEXT:    pushq %rax
 ; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
 ; FALLBACK10-NEXT:    vmovups 32(%rdi), %xmm1
 ; FALLBACK10-NEXT:    movq 48(%rdi), %rcx
@@ -21004,62 +20971,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT:    leal (,%rax,8), %esi
-; FALLBACK10-NEXT:    andl $56, %esi
+; FALLBACK10-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK10-NEXT:    andl $56, %ecx
+; FALLBACK10-NEXT:    movl %ecx, %esi
 ; FALLBACK10-NEXT:    andl $56, %eax
-; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %rcx
-; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %rdi
-; FALLBACK10-NEXT:    shrxq %rsi, %rdi, %r12
-; FALLBACK10-NEXT:    movq -96(%rsp,%rax), %r13
-; FALLBACK10-NEXT:    shrxq %rsi, %rcx, %r9
-; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %r10
-; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r14
-; FALLBACK10-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK10-NEXT:    movl %esi, %ebx
-; FALLBACK10-NEXT:    notb %bl
-; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %rbp
-; FALLBACK10-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK10-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK10-NEXT:    orq %r11, %r8
-; FALLBACK10-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK10-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK10-NEXT:    orq %r12, %r11
+; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
+; FALLBACK10-NEXT:    notb %cl
+; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %r10
+; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK10-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK10-NEXT:    shlxq %rcx, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %r8, %rdi
+; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK10-NEXT:    shrxq %rsi, %r11, %rbx
+; FALLBACK10-NEXT:    movq -96(%rsp,%rax), %r14
+; FALLBACK10-NEXT:    leaq (%r14,%r14), %r8
+; FALLBACK10-NEXT:    shlxq %rcx, %r8, %r8
+; FALLBACK10-NEXT:    orq %rbx, %r8
+; FALLBACK10-NEXT:    shrxq %rsi, %r9, %rbx
+; FALLBACK10-NEXT:    addq %r11, %r11
+; FALLBACK10-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK10-NEXT:    orq %rbx, %r11
+; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %rbx
+; FALLBACK10-NEXT:    shrxq %rsi, %rbx, %r15
 ; FALLBACK10-NEXT:    movq -80(%rsp,%rax), %r12
-; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r13
-; FALLBACK10-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK10-NEXT:    leaq (%r12,%r12), %r13
+; FALLBACK10-NEXT:    shlxq %rcx, %r13, %r13
+; FALLBACK10-NEXT:    orq %r15, %r13
+; FALLBACK10-NEXT:    shrxq %rsi, %r14, %r14
+; FALLBACK10-NEXT:    addq %rbx, %rbx
+; FALLBACK10-NEXT:    shlxq %rcx, %rbx, %rbx
+; FALLBACK10-NEXT:    orq %r14, %rbx
+; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r14
 ; FALLBACK10-NEXT:    movq -72(%rsp,%rax), %rax
-; FALLBACK10-NEXT:    sarxq %rsi, %rax, %rsi
-; FALLBACK10-NEXT:    addq %rdi, %rdi
-; FALLBACK10-NEXT:    shlxq %rbx, %rdi, %rdi
-; FALLBACK10-NEXT:    orq %r9, %rdi
-; FALLBACK10-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK10-NEXT:    shlxq %rbx, %r9, %r9
-; FALLBACK10-NEXT:    orq %r14, %r9
-; FALLBACK10-NEXT:    addq %r10, %r10
-; FALLBACK10-NEXT:    shlxq %rbx, %r10, %r10
-; FALLBACK10-NEXT:    orq %r15, %r10
-; FALLBACK10-NEXT:    addq %rax, %rax
-; FALLBACK10-NEXT:    shlxq %rbx, %rax, %rax
-; FALLBACK10-NEXT:    orq %r13, %rax
-; FALLBACK10-NEXT:    addq %rcx, %rcx
-; FALLBACK10-NEXT:    shlxq %rbx, %rcx, %rcx
-; FALLBACK10-NEXT:    orq %rbp, %rcx
-; FALLBACK10-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK10-NEXT:    leaq (%rax,%rax), %r15
+; FALLBACK10-NEXT:    shlxq %rcx, %r15, %r15
+; FALLBACK10-NEXT:    orq %r14, %r15
+; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r10
+; FALLBACK10-NEXT:    addq %r9, %r9
+; FALLBACK10-NEXT:    shlxq %rcx, %r9, %rcx
+; FALLBACK10-NEXT:    orq %r10, %rcx
+; FALLBACK10-NEXT:    sarxq %rsi, %rax, %rax
+; FALLBACK10-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
-; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK10-NEXT:    movq %r10, 32(%rdx)
-; FALLBACK10-NEXT:    movq %r9, 40(%rdx)
-; FALLBACK10-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK10-NEXT:    movq %r8, (%rdx)
-; FALLBACK10-NEXT:    addq $8, %rsp
+; FALLBACK10-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK10-NEXT:    movq %rbx, 32(%rdx)
+; FALLBACK10-NEXT:    movq %r13, 40(%rdx)
+; FALLBACK10-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, (%rdx)
 ; FALLBACK10-NEXT:    popq %rbx
 ; FALLBACK10-NEXT:    popq %r12
 ; FALLBACK10-NEXT:    popq %r13
 ; FALLBACK10-NEXT:    popq %r14
 ; FALLBACK10-NEXT:    popq %r15
-; FALLBACK10-NEXT:    popq %rbp
 ; FALLBACK10-NEXT:    vzeroupper
 ; FALLBACK10-NEXT:    retq
 ;
@@ -21292,13 +21257,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK14-LABEL: ashr_64bytes:
 ; FALLBACK14:       # %bb.0:
-; FALLBACK14-NEXT:    pushq %rbp
 ; FALLBACK14-NEXT:    pushq %r15
 ; FALLBACK14-NEXT:    pushq %r14
 ; FALLBACK14-NEXT:    pushq %r13
 ; FALLBACK14-NEXT:    pushq %r12
 ; FALLBACK14-NEXT:    pushq %rbx
-; FALLBACK14-NEXT:    pushq %rax
 ; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
 ; FALLBACK14-NEXT:    vmovups 32(%rdi), %xmm1
 ; FALLBACK14-NEXT:    movq 48(%rdi), %rcx
@@ -21317,62 +21280,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT:    leal (,%rax,8), %esi
-; FALLBACK14-NEXT:    andl $56, %esi
+; FALLBACK14-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK14-NEXT:    andl $56, %ecx
+; FALLBACK14-NEXT:    movl %ecx, %esi
 ; FALLBACK14-NEXT:    andl $56, %eax
-; FALLBACK14-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK14-NEXT:    movq -112(%rsp,%rax), %rcx
-; FALLBACK14-NEXT:    movq -104(%rsp,%rax), %rdi
-; FALLBACK14-NEXT:    shrxq %rsi, %rdi, %r12
-; FALLBACK14-NEXT:    movq -96(%rsp,%rax), %r13
-; FALLBACK14-NEXT:    shrxq %rsi, %rcx, %r9
-; FALLBACK14-NEXT:    movq -88(%rsp,%rax), %r10
-; FALLBACK14-NEXT:    shrxq %rsi, %r10, %r14
-; FALLBACK14-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK14-NEXT:    movl %esi, %ebx
-; FALLBACK14-NEXT:    notb %bl
-; FALLBACK14-NEXT:    movq -120(%rsp,%rax), %rbp
-; FALLBACK14-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK14-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK14-NEXT:    orq %r11, %r8
-; FALLBACK14-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK14-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK14-NEXT:    orq %r12, %r11
+; FALLBACK14-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
+; FALLBACK14-NEXT:    notb %cl
+; FALLBACK14-NEXT:    movq -120(%rsp,%rax), %r10
+; FALLBACK14-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK14-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK14-NEXT:    shlxq %rcx, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %r8, %rdi
+; FALLBACK14-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK14-NEXT:    shrxq %rsi, %r11, %rbx
+; FALLBACK14-NEXT:    movq -96(%rsp,%rax), %r14
+; FALLBACK14-NEXT:    leaq (%r14,%r14), %r8
+; FALLBACK14-NEXT:    shlxq %rcx, %r8, %r8
+; FALLBACK14-NEXT:    orq %rbx, %r8
+; FALLBACK14-NEXT:    shrxq %rsi, %r9, %rbx
+; FALLBACK14-NEXT:    addq %r11, %r11
+; FALLBACK14-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK14-NEXT:    orq %rbx, %r11
+; FALLBACK14-NEXT:    movq -88(%rsp,%rax), %rbx
+; FALLBACK14-NEXT:    shrxq %rsi, %rbx, %r15
 ; FALLBACK14-NEXT:    movq -80(%rsp,%rax), %r12
-; FALLBACK14-NEXT:    shrxq %rsi, %r12, %r13
-; FALLBACK14-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK14-NEXT:    leaq (%r12,%r12), %r13
+; FALLBACK14-NEXT:    shlxq %rcx, %r13, %r13
+; FALLBACK14-NEXT:    orq %r15, %r13
+; FALLBACK14-NEXT:    shrxq %rsi, %r14, %r14
+; FALLBACK14-NEXT:    addq %rbx, %rbx
+; FALLBACK14-NEXT:    shlxq %rcx, %rbx, %rbx
+; FALLBACK14-NEXT:    orq %r14, %rbx
+; FALLBACK14-NEXT:    shrxq %rsi, %r12, %r14
 ; FALLBACK14-NEXT:    movq -72(%rsp,%rax), %rax
-; FALLBACK14-NEXT:    sarxq %rsi, %rax, %rsi
-; FALLBACK14-NEXT:    addq %rdi, %rdi
-; FALLBACK14-NEXT:    shlxq %rbx, %rdi, %rdi
-; FALLBACK14-NEXT:    orq %r9, %rdi
-; FALLBACK14-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK14-NEXT:    shlxq %rbx, %r9, %r9
-; FALLBACK14-NEXT:    orq %r14, %r9
-; FALLBACK14-NEXT:    addq %r10, %r10
-; FALLBACK14-NEXT:    shlxq %rbx, %r10, %r10
-; FALLBACK14-NEXT:    orq %r15, %r10
-; FALLBACK14-NEXT:    addq %rax, %rax
-; FALLBACK14-NEXT:    shlxq %rbx, %rax, %rax
-; FALLBACK14-NEXT:    orq %r13, %rax
-; FALLBACK14-NEXT:    addq %rcx, %rcx
-; FALLBACK14-NEXT:    shlxq %rbx, %rcx, %rcx
-; FALLBACK14-NEXT:    orq %rbp, %rcx
-; FALLBACK14-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK14-NEXT:    leaq (%rax,%rax), %r15
+; FALLBACK14-NEXT:    shlxq %rcx, %r15, %r15
+; FALLBACK14-NEXT:    orq %r14, %r15
+; FALLBACK14-NEXT:    shrxq %rsi, %r10, %r10
+; FALLBACK14-NEXT:    addq %r9, %r9
+; FALLBACK14-NEXT:    shlxq %rcx, %r9, %rcx
+; FALLBACK14-NEXT:    orq %r10, %rcx
+; FALLBACK14-NEXT:    sarxq %rsi, %rax, %rax
+; FALLBACK14-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK14-NEXT:    movq %rcx, 8(%rdx)
-; FALLBACK14-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK14-NEXT:    movq %r10, 32(%rdx)
-; FALLBACK14-NEXT:    movq %r9, 40(%rdx)
-; FALLBACK14-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK14-NEXT:    movq %r8, (%rdx)
-; FALLBACK14-NEXT:    addq $8, %rsp
+; FALLBACK14-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK14-NEXT:    movq %rbx, 32(%rdx)
+; FALLBACK14-NEXT:    movq %r13, 40(%rdx)
+; FALLBACK14-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, (%rdx)
 ; FALLBACK14-NEXT:    popq %rbx
 ; FALLBACK14-NEXT:    popq %r12
 ; FALLBACK14-NEXT:    popq %r13
 ; FALLBACK14-NEXT:    popq %r14
 ; FALLBACK14-NEXT:    popq %r15
-; FALLBACK14-NEXT:    popq %rbp
 ; FALLBACK14-NEXT:    vzeroupper
 ; FALLBACK14-NEXT:    retq
 ;
@@ -21960,111 +21921,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %eax, %ecx
 ; FALLBACK18-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK18-NEXT:    andl $24, %edx
+; FALLBACK18-NEXT:    movl %edx, %ebx
 ; FALLBACK18-NEXT:    andl $60, %ecx
 ; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl %edx, %ebx
-; FALLBACK18-NEXT:    notb %bl
+; FALLBACK18-NEXT:    notb %dl
 ; FALLBACK18-NEXT:    leal (%edi,%edi), %ebp
-; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
 ; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
 ; FALLBACK18-NEXT:    addl %esi, %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 80(%esp,%ecx), %esi
 ; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    movl 76(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK18-NEXT:    orl %eax, %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    movl 84(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    orl %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    movl 92(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK18-NEXT:    orl %eax, %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 104(%esp,%ecx), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    orl %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %ecx, %ebp
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK18-NEXT:    movl %ecx, %edi
-; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %esi, %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK18-NEXT:    orl %ecx, %esi
-; FALLBACK18-NEXT:    movl 120(%esp,%edi), %ebp
-; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT:    movl 116(%esp,%edi), %eax
-; FALLBACK18-NEXT:    shrxl %edx, %eax, %edi
-; FALLBACK18-NEXT:    orl %edi, %ecx
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK18-NEXT:    orl %ecx, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 120(%esp,%ebp), %edi
+; FALLBACK18-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK18-NEXT:    movl 116(%esp,%ebp), %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %esi
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %eax, %eax
-; FALLBACK18-NEXT:    shlxl %ebx, %eax, %edi
-; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT:    shrxl %edx, %ebp, %eax
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK18-NEXT:    movl 124(%esp,%ebp), %ebp
-; FALLBACK18-NEXT:    sarxl %edx, %ebp, %edx
-; FALLBACK18-NEXT:    addl %ebp, %ebp
-; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %ebx
-; FALLBACK18-NEXT:    orl %eax, %ebx
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK18-NEXT:    orl %ebp, %ecx
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl 124(%esp,%eax), %eax
+; FALLBACK18-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %edx
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %edi, %edx
+; FALLBACK18-NEXT:    sarxl %ebx, %eax, %edi
 ; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT:    movl %edx, 60(%eax)
-; FALLBACK18-NEXT:    movl %ebx, 56(%eax)
-; FALLBACK18-NEXT:    movl %edi, 48(%eax)
-; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK18-NEXT:    movl %esi, 40(%eax)
+; FALLBACK18-NEXT:    movl %edi, 60(%eax)
+; FALLBACK18-NEXT:    movl %edx, 56(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK18-NEXT:    movl %esi, 52(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -22664,111 +22626,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movl %eax, %ecx
 ; FALLBACK22-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK22-NEXT:    andl $24, %edx
+; FALLBACK22-NEXT:    movl %edx, %ebx
 ; FALLBACK22-NEXT:    andl $60, %ecx
 ; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl %edx, %ebx
-; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    notb %dl
 ; FALLBACK22-NEXT:    leal (%edi,%edi), %ebp
-; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %ebp, %eax
 ; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
 ; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK22-NEXT:    orl %edi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 80(%esp,%ecx), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    movl 76(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK22-NEXT:    orl %eax, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    orl %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK22-NEXT:    orl %eax, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    orl %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %ecx, %ebp
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK22-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK22-NEXT:    movl %ecx, %edi
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK22-NEXT:    orl %edi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %ecx, %esi
-; FALLBACK22-NEXT:    movl 120(%esp,%edi), %ebp
-; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
-; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT:    movl 116(%esp,%edi), %eax
-; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
-; FALLBACK22-NEXT:    orl %edi, %ecx
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    orl %ecx, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 120(%esp,%ebp), %edi
+; FALLBACK22-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK22-NEXT:    movl 116(%esp,%ebp), %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %esi
+; FALLBACK22-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %eax, %eax
-; FALLBACK22-NEXT:    shlxl %ebx, %eax, %edi
-; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT:    shrxl %edx, %ebp, %eax
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK22-NEXT:    movl 124(%esp,%ebp), %ebp
-; FALLBACK22-NEXT:    sarxl %edx, %ebp, %edx
-; FALLBACK22-NEXT:    addl %ebp, %ebp
-; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebx
-; FALLBACK22-NEXT:    orl %eax, %ebx
+; FALLBACK22-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK22-NEXT:    orl %ebp, %ecx
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl 124(%esp,%eax), %eax
+; FALLBACK22-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK22-NEXT:    shlxl %edx, %ebp, %edx
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %edi, %edx
+; FALLBACK22-NEXT:    sarxl %ebx, %eax, %edi
 ; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT:    movl %edx, 60(%eax)
-; FALLBACK22-NEXT:    movl %ebx, 56(%eax)
-; FALLBACK22-NEXT:    movl %edi, 48(%eax)
-; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK22-NEXT:    movl %esi, 40(%eax)
+; FALLBACK22-NEXT:    movl %edi, 60(%eax)
+; FALLBACK22-NEXT:    movl %edx, 56(%eax)
+; FALLBACK22-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK22-NEXT:    movl %esi, 52(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -23326,111 +23289,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl %eax, %ecx
 ; FALLBACK26-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK26-NEXT:    andl $24, %edx
+; FALLBACK26-NEXT:    movl %edx, %ebx
 ; FALLBACK26-NEXT:    andl $60, %ecx
 ; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl %edx, %ebx
-; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    notb %dl
 ; FALLBACK26-NEXT:    leal (%edi,%edi), %ebp
-; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %ebp, %eax
 ; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    addl %esi, %esi
-; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK26-NEXT:    orl %edi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 80(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 76(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK26-NEXT:    orl %eax, %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 88(%esp,%ecx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 84(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    orl %esi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 92(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK26-NEXT:    orl %eax, %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 104(%esp,%ecx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    orl %esi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl %ecx, %ebp
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK26-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK26-NEXT:    movl %ecx, %edi
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK26-NEXT:    orl %edi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %esi, %esi
-; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %ecx, %esi
-; FALLBACK26-NEXT:    movl 120(%esp,%edi), %ebp
-; FALLBACK26-NEXT:    leal (%ebp,%ebp), %ecx
-; FALLBACK26-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT:    movl 116(%esp,%edi), %eax
-; FALLBACK26-NEXT:    shrxl %edx, %eax, %edi
-; FALLBACK26-NEXT:    orl %edi, %ecx
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK26-NEXT:    orl %ecx, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 120(%esp,%ebp), %edi
+; FALLBACK26-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK26-NEXT:    movl 116(%esp,%ebp), %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %esi
+; FALLBACK26-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %eax, %eax
-; FALLBACK26-NEXT:    shlxl %ebx, %eax, %edi
-; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK26-NEXT:    shrxl %edx, %ebp, %eax
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK26-NEXT:    movl 124(%esp,%ebp), %ebp
-; FALLBACK26-NEXT:    sarxl %edx, %ebp, %edx
-; FALLBACK26-NEXT:    addl %ebp, %ebp
-; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %ebx
-; FALLBACK26-NEXT:    orl %eax, %ebx
+; FALLBACK26-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK26-NEXT:    orl %ebp, %ecx
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl 124(%esp,%eax), %eax
+; FALLBACK26-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK26-NEXT:    shlxl %edx, %ebp, %edx
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %edi, %edx
+; FALLBACK26-NEXT:    sarxl %ebx, %eax, %edi
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT:    movl %edx, 60(%eax)
-; FALLBACK26-NEXT:    movl %ebx, 56(%eax)
-; FALLBACK26-NEXT:    movl %edi, 48(%eax)
-; FALLBACK26-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK26-NEXT:    movl %esi, 40(%eax)
+; FALLBACK26-NEXT:    movl %edi, 60(%eax)
+; FALLBACK26-NEXT:    movl %edx, 56(%eax)
+; FALLBACK26-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK26-NEXT:    movl %esi, 52(%eax)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK26-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -23988,111 +23952,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl %eax, %ecx
 ; FALLBACK30-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK30-NEXT:    andl $24, %edx
+; FALLBACK30-NEXT:    movl %edx, %ebx
 ; FALLBACK30-NEXT:    andl $60, %ecx
 ; FALLBACK30-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK30-NEXT:    movl 72(%esp,%ecx), %edi
 ; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl %edx, %ebx
-; FALLBACK30-NEXT:    notb %bl
+; FALLBACK30-NEXT:    notb %dl
 ; FALLBACK30-NEXT:    leal (%edi,%edi), %ebp
-; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %ebp, %eax
 ; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK30-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
 ; FALLBACK30-NEXT:    addl %esi, %esi
-; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK30-NEXT:    orl %edi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 80(%esp,%ecx), %esi
 ; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK30-NEXT:    movl 76(%esp,%ecx), %edi
-; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK30-NEXT:    orl %eax, %edi
 ; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 88(%esp,%ecx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK30-NEXT:    movl 84(%esp,%ecx), %edi
-; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK30-NEXT:    orl %esi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK30-NEXT:    movl 92(%esp,%ecx), %edi
-; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK30-NEXT:    orl %eax, %edi
 ; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 104(%esp,%ecx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK30-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK30-NEXT:    orl %esi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl %ecx, %ebp
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 112(%esp,%ecx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK30-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK30-NEXT:    movl %ecx, %edi
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK30-NEXT:    orl %edi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    addl %esi, %esi
-; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %ecx, %esi
-; FALLBACK30-NEXT:    movl 120(%esp,%edi), %ebp
-; FALLBACK30-NEXT:    leal (%ebp,%ebp), %ecx
-; FALLBACK30-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT:    movl 116(%esp,%edi), %eax
-; FALLBACK30-NEXT:    shrxl %edx, %eax, %edi
-; FALLBACK30-NEXT:    orl %edi, %ecx
-; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK30-NEXT:    orl %ecx, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 120(%esp,%ebp), %edi
+; FALLBACK30-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK30-NEXT:    movl 116(%esp,%ebp), %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %esi
+; FALLBACK30-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    addl %eax, %eax
-; FALLBACK30-NEXT:    shlxl %ebx, %eax, %edi
-; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK30-NEXT:    shrxl %edx, %ebp, %eax
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK30-NEXT:    movl 124(%esp,%ebp), %ebp
-; FALLBACK30-NEXT:    sarxl %edx, %ebp, %edx
-; FALLBACK30-NEXT:    addl %ebp, %ebp
-; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %ebx
-; FALLBACK30-NEXT:    orl %eax, %ebx
+; FALLBACK30-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK30-NEXT:    orl %ebp, %ecx
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl 124(%esp,%eax), %eax
+; FALLBACK30-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK30-NEXT:    shlxl %edx, %ebp, %edx
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %edi, %edx
+; FALLBACK30-NEXT:    sarxl %ebx, %eax, %edi
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT:    movl %edx, 60(%eax)
-; FALLBACK30-NEXT:    movl %ebx, 56(%eax)
-; FALLBACK30-NEXT:    movl %edi, 48(%eax)
-; FALLBACK30-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK30-NEXT:    movl %esi, 40(%eax)
+; FALLBACK30-NEXT:    movl %edi, 60(%eax)
+; FALLBACK30-NEXT:    movl %edx, 56(%eax)
+; FALLBACK30-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK30-NEXT:    movl %esi, 52(%eax)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK30-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 338e104fbe8f0..221a51ed44696 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -712,33 +712,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -994,42 +994,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %al, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, 28(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, 28(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -1297,33 +1297,33 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ecx, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -1487,31 +1487,31 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -72(%rsp,%rsi,8), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %sil, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -72(%rsp,%rsi,8), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rsi,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rsi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
@@ -1761,88 +1761,90 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 32(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 20(%edi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%edi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -2040,32 +2042,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movsbq %cl, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    negb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movsbq %sil, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -16(%rsp,%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rdi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rsi, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, -16(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rdi, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
@@ -2319,97 +2321,101 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $28, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $28, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %dl, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%esi), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%esi), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%esi), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%esi), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, 92(%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, 92(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -2610,31 +2616,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -72(%rsp,%rsi,8), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rax, %rsi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %sil, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -72(%rsp,%rsi,8), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rsi,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rsi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
@@ -2927,60 +2933,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 32(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 32(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ecx, %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%esi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3263,13 +3268,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -3292,65 +3295,63 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r12,%r12), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r13, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r12, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
@@ -3868,20 +3869,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -3906,116 +3907,117 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ebx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ecx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -4388,10 +4390,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
@@ -4419,63 +4419,61 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    negl %esi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r13d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r9, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rsi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rsi), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r15, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r15, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r12, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, -8(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rsi, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r12, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rbx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
@@ -4972,33 +4970,33 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
@@ -5011,7 +5009,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -5032,149 +5030,152 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 188(%esp,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 60(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 44(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 36(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -5534,13 +5535,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
@@ -5567,65 +5566,63 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r12,%r12), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r10, %r13, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r12, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
@@ -6221,33 +6218,31 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ebx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
@@ -6256,87 +6251,84 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index c3054a365c466..6b5c6049f025b 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -1635,22 +1635,22 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -72(%rsp,%rcx,8), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rcx,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1807,40 +1807,43 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 16(%esp,%ecx,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%esi,4), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi,4), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ecx,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -1906,13 +1909,13 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-BMI2-NEXT:    andl $56, %eax
-; X64-BMI2-NEXT:    andl $56, %esi
-; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT:    movl %eax, %ecx
 ; X64-BMI2-NEXT:    notl %eax
-; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
-; X64-BMI2-NEXT:    addl %esi, %esi
-; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT:    andl $56, %esi
+; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT:    addl %edi, %edi
+; X64-BMI2-NEXT:    shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rcx
 ; X64-BMI2-NEXT:    orl %eax, %ecx
 ; X64-BMI2-NEXT:    movb %cl, (%rdx)
 ; X64-BMI2-NEXT:    popq %rax
@@ -2070,13 +2073,13 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-BMI2-NEXT:    andl $56, %eax
-; X64-BMI2-NEXT:    andl $56, %esi
-; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT:    movl %eax, %ecx
 ; X64-BMI2-NEXT:    notl %eax
-; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
-; X64-BMI2-NEXT:    addl %esi, %esi
-; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT:    andl $56, %esi
+; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT:    addl %edi, %edi
+; X64-BMI2-NEXT:    shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rcx
 ; X64-BMI2-NEXT:    orl %eax, %ecx
 ; X64-BMI2-NEXT:    movw %cx, (%rdx)
 ; X64-BMI2-NEXT:    popq %rax
@@ -2233,13 +2236,13 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-BMI2-NEXT:    andl $56, %eax
-; X64-BMI2-NEXT:    andl $56, %esi
-; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT:    movl %eax, %ecx
 ; X64-BMI2-NEXT:    notl %eax
-; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
-; X64-BMI2-NEXT:    addl %esi, %esi
-; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT:    andl $56, %esi
+; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT:    addl %edi, %edi
+; X64-BMI2-NEXT:    shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rcx
 ; X64-BMI2-NEXT:    orl %eax, %ecx
 ; X64-BMI2-NEXT:    movl %ecx, (%rdx)
 ; X64-BMI2-NEXT:    popq %rax
@@ -2521,10 +2524,11 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $140, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -2541,25 +2545,26 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, (%esp,%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2667,21 +2672,21 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, -128(%rsp,%rsi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r10, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, %r9, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
@@ -2860,33 +2865,33 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, 16(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
@@ -3026,9 +3031,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ;
 ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
@@ -3043,38 +3046,36 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, -128(%rsp,%rsi), %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, %r9, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r9, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, %r10, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rdi, %r10, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -3304,7 +3305,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $172, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
@@ -3320,59 +3321,60 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, 32(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%eax), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%eax), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 24(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
@@ -3380,7 +3382,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $172, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 7735500bd3a88..bed8e5806380c 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -1879,22 +1879,22 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -72(%rsp,%rcx,8), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rcx,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rcx,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
@@ -2055,40 +2055,43 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 16(%esp,%ecx,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%esi,4), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi,4), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ecx,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
index 4d261a9810896..9fbbba2ed3b47 100644
--- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -820,7 +820,7 @@ define void @infiniteloop() {
 ; ENABLE-NEXT:    movq %rsp, %rax
 ; ENABLE-NEXT:    addq $-16, %rax
 ; ENABLE-NEXT:    movq %rax, %rsp
-; ENABLE-NEXT:    xorl    %ecx, %ecx
+; ENABLE-NEXT:    xorl %ecx, %ecx
 ; ENABLE-NEXT:    .p2align 4
 ; ENABLE-NEXT:  LBB10_2: ## %for.body
 ; ENABLE-NEXT:    ## =>This Inner Loop Header: Depth=1
@@ -851,8 +851,8 @@ define void @infiniteloop() {
 ; DISABLE-NEXT:  ## %bb.1: ## %if.then
 ; DISABLE-NEXT:    movq %rsp, %rax
 ; DISABLE-NEXT:    addq $-16, %rax
-; DISABLE-NEXT:    %rax, %rsp
-; DISABLE-NEXT:    xorl    %ecx, %ecx
+; DISABLE-NEXT:    movq %rax, %rsp
+; DISABLE-NEXT:    xorl %ecx, %ecx
 ; DISABLE-NEXT:    .p2align 4
 ; DISABLE-NEXT:  LBB10_2: ## %for.body
 ; DISABLE-NEXT:    ## =>This Inner Loop Header: Depth=1
@@ -1185,10 +1185,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 {
 ; ENABLE-NEXT:    .p2align 4
 ; ENABLE-NEXT:  LBB14_2: ## %for.body
 ; ENABLE-NEXT:    ## =>This Inner Loop Header: Depth=1
-; ENABLE-NEXT:    cmpl %esi, %edi
-; ENABLE-NEXT:    setl %al
+; ENABLE-NEXT:    movl %esi, %eax
 ; ENABLE-NEXT:    xorl %esi, %esi
-; ENABLE-NEXT:    movb %al, %sil
+; ENABLE-NEXT:    cmpl %eax, %edi
+; ENABLE-NEXT:    setl %sil
 ; ENABLE-NEXT:    incb %dl
 ; ENABLE-NEXT:    cmpb $45, %dl
 ; ENABLE-NEXT:    jl LBB14_2
@@ -1220,10 +1220,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 {
 ; DISABLE-NEXT:    .p2align 4
 ; DISABLE-NEXT:  LBB14_2: ## %for.body
 ; DISABLE-NEXT:    ## =>This Inner Loop Header: Depth=1
-; DISABLE-NEXT:    cmpl %esi, %edi
-; DISABLE-NEXT:    setl %al
+; DISABLE-NEXT:    movl %esi, %eax
 ; DISABLE-NEXT:    xorl %esi, %esi
-; DISABLE-NEXT:    movb %al, %sil
+; DISABLE-NEXT:    cmpl %eax, %edi
+; DISABLE-NEXT:    setl %sil
 ; DISABLE-NEXT:    incb %dl
 ; DISABLE-NEXT:    cmpb $45, %dl
 ; DISABLE-NEXT:    jl LBB14_2
diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll
index 2bef66825d8c0..59fbf7183abc6 100644
--- a/llvm/test/CodeGen/X86/xor.ll
+++ b/llvm/test/CodeGen/X86/xor.ll
@@ -62,12 +62,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind  {
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB3_1: # %bb
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    notl %edx
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl %edx, %ecx
+; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    jne .LBB3_1
 ; X86-NEXT:  # %bb.2: # %bb12
 ; X86-NEXT:    retl
@@ -78,12 +78,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind  {
 ; X64-LIN-NEXT:    .p2align 4
 ; X64-LIN-NEXT:  .LBB3_1: # %bb
 ; X64-LIN-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-LIN-NEXT:    movl %esi, %ecx
 ; X64-LIN-NEXT:    xorl %esi, %eax
-; X64-LIN-NEXT:    movl %eax, %ecx
-; X64-LIN-NEXT:    notl %ecx
-; X64-LIN-NEXT:    andl %esi, %ecx
-; X64-LIN-NEXT:    addl %ecx, %ecx
-; X64-LIN-NEXT:    movl %ecx, %esi
+; X64-LIN-NEXT:    movl %eax, %esi
+; X64-LIN-NEXT:    notl %esi
+; X64-LIN-NEXT:    andl %ecx, %esi
+; X64-LIN-NEXT:    addl %esi, %esi
 ; X64-LIN-NEXT:    jne .LBB3_1
 ; X64-LIN-NEXT:  # %bb.2: # %bb12
 ; X64-LIN-NEXT:    retq
@@ -94,12 +94,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind  {
 ; X64-WIN-NEXT:    .p2align 4
 ; X64-WIN-NEXT:  .LBB3_1: # %bb
 ; X64-WIN-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-WIN-NEXT:    movl %edx, %ecx
 ; X64-WIN-NEXT:    xorl %edx, %eax
-; X64-WIN-NEXT:    movl %eax, %ecx
-; X64-WIN-NEXT:    notl %ecx
-; X64-WIN-NEXT:    andl %edx, %ecx
-; X64-WIN-NEXT:    addl %ecx, %ecx
-; X64-WIN-NEXT:    movl %ecx, %edx
+; X64-WIN-NEXT:    movl %eax, %edx
+; X64-WIN-NEXT:    notl %edx
+; X64-WIN-NEXT:    andl %ecx, %edx
+; X64-WIN-NEXT:    addl %edx, %edx
 ; X64-WIN-NEXT:    jne .LBB3_1
 ; X64-WIN-NEXT:  # %bb.2: # %bb12
 ; X64-WIN-NEXT:    retq
@@ -126,13 +126,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind  {
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB4_1: # %bb
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    notl %edx
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    testw %dx, %dx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl %edx, %ecx
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    testw %cx, %cx
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.2: # %bb12
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -144,13 +144,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind  {
 ; X64-LIN-NEXT:    .p2align 4
 ; X64-LIN-NEXT:  .LBB4_1: # %bb
 ; X64-LIN-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-LIN-NEXT:    xorl %esi, %eax
-; X64-LIN-NEXT:    movl %eax, %ecx
-; X64-LIN-NEXT:    notl %ecx
-; X64-LIN-NEXT:    andl %esi, %ecx
-; X64-LIN-NEXT:    addl %ecx, %ecx
-; X64-LIN-NEXT:    testw %cx, %cx
-; X64-LIN-NEXT:    movl %ecx, %esi
+; X64-LIN-NEXT:    movl %esi, %ecx
+; X64-LIN-NEXT:    xorl %ecx, %eax
+; X64-LIN-NEXT:    movl %eax, %esi
+; X64-LIN-NEXT:    notl %esi
+; X64-LIN-NEXT:    andl %ecx, %esi
+; X64-LIN-NEXT:    addl %esi, %esi
+; X64-LIN-NEXT:    testw %si, %si
 ; X64-LIN-NEXT:    jne .LBB4_1
 ; X64-LIN-NEXT:  # %bb.2: # %bb12
 ; X64-LIN-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -163,13 +163,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind  {
 ; X64-WIN-NEXT:    .p2align 4
 ; X64-WIN-NEXT:  .LBB4_1: # %bb
 ; X64-WIN-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-WIN-NEXT:    xorl %edx, %eax
-; X64-WIN-NEXT:    movl %eax, %ecx
-; X64-WIN-NEXT:    notl %ecx
-; X64-WIN-NEXT:    andl %edx, %ecx
-; X64-WIN-NEXT:    addl %ecx, %ecx
-; X64-WIN-NEXT:    testw %cx, %cx
-; X64-WIN-NEXT:    movl %ecx, %edx
+; X64-WIN-NEXT:    movl %edx, %ecx
+; X64-WIN-NEXT:    xorl %ecx, %eax
+; X64-WIN-NEXT:    movl %eax, %edx
+; X64-WIN-NEXT:    notl %edx
+; X64-WIN-NEXT:    andl %ecx, %edx
+; X64-WIN-NEXT:    addl %edx, %edx
+; X64-WIN-NEXT:    testw %dx, %dx
 ; X64-WIN-NEXT:    jne .LBB4_1
 ; X64-WIN-NEXT:  # %bb.2: # %bb12
 ; X64-WIN-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -197,12 +197,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind  {
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB5_1: # %bb
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    xorb %cl, %al
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    notb %dl
-; X86-NEXT:    andb %cl, %dl
-; X86-NEXT:    addb %dl, %dl
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    andb %dl, %cl
+; X86-NEXT:    addb %cl, %cl
 ; X86-NEXT:    jne .LBB5_1
 ; X86-NEXT:  # %bb.2: # %bb12
 ; X86-NEXT:    retl
@@ -213,12 +213,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind  {
 ; X64-LIN-NEXT:    .p2align 4
 ; X64-LIN-NEXT:  .LBB5_1: # %bb
 ; X64-LIN-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-LIN-NEXT:    movl %esi, %ecx
 ; X64-LIN-NEXT:    xorb %sil, %al
-; X64-LIN-NEXT:    movl %eax, %ecx
-; X64-LIN-NEXT:    notb %cl
-; X64-LIN-NEXT:    andb %sil, %cl
-; X64-LIN-NEXT:    addb %cl, %cl
-; X64-LIN-NEXT:    movl %ecx, %esi
+; X64-LIN-NEXT:    movl %eax, %esi
+; X64-LIN-NEXT:    notb %sil
+; X64-LIN-NEXT:    andb %cl, %sil
+; X64-LIN-NEXT:    addb %sil, %sil
 ; X64-LIN-NEXT:    jne .LBB5_1
 ; X64-LIN-NEXT:  # %bb.2: # %bb12
 ; X64-LIN-NEXT:    # kill: def $al killed $al killed $eax
@@ -230,12 +230,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind  {
 ; X64-WIN-NEXT:    .p2align 4
 ; X64-WIN-NEXT:  .LBB5_1: # %bb
 ; X64-WIN-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-WIN-NEXT:    movl %edx, %ecx
 ; X64-WIN-NEXT:    xorb %dl, %al
-; X64-WIN-NEXT:    movl %eax, %ecx
-; X64-WIN-NEXT:    notb %cl
-; X64-WIN-NEXT:    andb %dl, %cl
-; X64-WIN-NEXT:    addb %cl, %cl
-; X64-WIN-NEXT:    movl %ecx, %edx
+; X64-WIN-NEXT:    movl %eax, %edx
+; X64-WIN-NEXT:    notb %dl
+; X64-WIN-NEXT:    andb %cl, %dl
+; X64-WIN-NEXT:    addb %dl, %dl
 ; X64-WIN-NEXT:    jne .LBB5_1
 ; X64-WIN-NEXT:  # %bb.2: # %bb12
 ; X64-WIN-NEXT:    retq
@@ -262,12 +262,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind  {
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB6_1: # %bb
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    xorl $2147483646, %edx # imm = 0x7FFFFFFE
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    xorl $2147483646, %ecx # imm = 0x7FFFFFFE
+; X86-NEXT:    andl %edx, %ecx
+; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    jne .LBB6_1
 ; X86-NEXT:  # %bb.2: # %bb12
 ; X86-NEXT:    retl
@@ -278,12 +278,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind  {
 ; X64-LIN-NEXT:    .p2align 4
 ; X64-LIN-NEXT:  .LBB6_1: # %bb
 ; X64-LIN-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-LIN-NEXT:    movl %esi, %ecx
 ; X64-LIN-NEXT:    xorl %esi, %eax
-; X64-LIN-NEXT:    movl %eax, %ecx
-; X64-LIN-NEXT:    xorl $2147483646, %ecx # imm = 0x7FFFFFFE
-; X64-LIN-NEXT:    andl %esi, %ecx
-; X64-LIN-NEXT:    addl %ecx, %ecx
-; X64-LIN-NEXT:    movl %ecx, %esi
+; X64-LIN-NEXT:    movl %eax, %esi
+; X64-LIN-NEXT:    xorl $2147483646, %esi # imm = 0x7FFFFFFE
+; X64-LIN-NEXT:    andl %ecx, %esi
+; X64-LIN-NEXT:    addl %esi, %esi
 ; X64-LIN-NEXT:    jne .LBB6_1
 ; X64-LIN-NEXT:  # %bb.2: # %bb12
 ; X64-LIN-NEXT:    retq
@@ -294,12 +294,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind  {
 ; X64-WIN-NEXT:    .p2align 4
 ; X64-WIN-NEXT:  .LBB6_1: # %bb
 ; X64-WIN-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-WIN-NEXT:    movl %edx, %ecx
 ; X64-WIN-NEXT:    xorl %edx, %eax
-; X64-WIN-NEXT:    movl %eax, %ecx
-; X64-WIN-NEXT:    xorl $2147483646, %ecx # imm = 0x7FFFFFFE
-; X64-WIN-NEXT:    andl %edx, %ecx
-; X64-WIN-NEXT:    addl %ecx, %ecx
-; X64-WIN-NEXT:    movl %ecx, %edx
+; X64-WIN-NEXT:    movl %eax, %edx
+; X64-WIN-NEXT:    xorl $2147483646, %edx # imm = 0x7FFFFFFE
+; X64-WIN-NEXT:    andl %ecx, %edx
+; X64-WIN-NEXT:    addl %edx, %edx
 ; X64-WIN-NEXT:    jne .LBB6_1
 ; X64-WIN-NEXT:  # %bb.2: # %bb12
 ; X64-WIN-NEXT:    retq
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
index c12d8135e5eba..082b876b542e5 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -234,16 +234,17 @@ define void @extrastride(ptr nocapture %main, i32 %main_stride, ptr nocapture %r
 ; X32-NEXT:    .p2align 4
 ; X32-NEXT:  .LBB2_2: # %for.body
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movl (%ebx,%esi), %ebp
-; X32-NEXT:    addl (%ebx), %ebp
-; X32-NEXT:    addl %esi, %ebx
-; X32-NEXT:    addl (%esi,%ebx), %ebp
-; X32-NEXT:    addl %esi, %ebx
-; X32-NEXT:    addl (%esi,%ebx), %ebp
-; X32-NEXT:    addl %esi, %ebx
-; X32-NEXT:    addl (%esi,%ebx), %ebp
-; X32-NEXT:    movl %ebp, (%edx)
-; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    movl %ebx, %ebp
+; X32-NEXT:    movl (%ebx,%esi), %ebx
+; X32-NEXT:    addl (%ebp), %ebx
+; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    addl (%esi,%ebp), %ebx
+; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    addl (%esi,%ebp), %ebx
+; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    addl (%esi,%ebp), %ebx
+; X32-NEXT:    movl %ebx, (%edx)
+; X32-NEXT:    leal (%ebp,%esi), %ebx
 ; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    decl %eax

From 2fc2e1f95156f86dead18f56233965f774fb551f Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Mon, 10 Nov 2025 12:33:28 -0800
Subject: [PATCH 04/97] [LLDB] Fix darwin shell tests under ASAN

---
 lldb/test/Shell/helper/toolchain.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py
index 728f6347242f1..faa29d23387cc 100644
--- a/lldb/test/Shell/helper/toolchain.py
+++ b/lldb/test/Shell/helper/toolchain.py
@@ -250,6 +250,15 @@ def use_support_substitutions(config):
             "-L{}".format(config.libcxx_libs_dir),
             "-lc++",
         ]
+    # By default, macOS doesn't allow injecting the ASAN runtime into system processes.
+    if platform.system() in ["Darwin"] and config.llvm_use_sanitizer:
+        system_clang = (
+            subprocess.check_output(["xcrun", "-find", "clang"]).strip().decode("utf-8")
+        )
+        system_liblto = os.path.join(
+            os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib"
+        )
+        host_flags += ["-Wl,-lto_library", "-Wl," + system_liblto]
 
     host_flags = " ".join(host_flags)
     config.substitutions.append(("%clang_host", "%clang " + host_flags))

From f8e9723d17187d2b5ba867919ca2f5158fc28659 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 12:49:01 -0800
Subject: [PATCH 05/97] ARM: Enable terminal rule (#165958)

---
 llvm/lib/Target/ARM/ARMSubtarget.h            |   1 +
 .../Thumb2/LowOverheadLoops/constbound.ll     |  18 +-
 .../Thumb2/LowOverheadLoops/minloop.ll        |  24 +-
 .../varying-outer-2d-reduction.ll             |  34 ++-
 .../Thumb2/LowOverheadLoops/while-loops.ll    |  91 ++++----
 .../CodeGen/Thumb2/mve-float32regloops.ll     | 211 +++++++++---------
 .../CodeGen/Thumb2/mve-gather-increment.ll    |  24 +-
 .../Thumb2/mve-gather-scatter-optimisation.ll |  90 ++++----
 llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll |  52 ++---
 llvm/test/CodeGen/Thumb2/mve-shuffle.ll       |  13 +-
 llvm/test/CodeGen/Thumb2/mve-vld4.ll          |  13 +-
 .../CodeGen/Thumb2/mve-vmaxnma-commute.ll     |  24 +-
 llvm/test/CodeGen/Thumb2/mve-vst4.ll          |  14 +-
 llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll      |   2 +-
 14 files changed, 302 insertions(+), 309 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 4a0883cc662e7..34baa3108402c 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -377,6 +377,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   bool isRWPI() const;
 
   bool useMachineScheduler() const { return UseMISched; }
+  bool enableTerminalRule() const override { return true; }
   bool useMachinePipeliner() const { return UseMIPipeliner; }
   bool hasMinSize() const { return OptMinSize; }
   bool isThumb1Only() const { return isThumb() && !hasThumb2(); }
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
index 79665af17ef58..9632469261f4d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
@@ -7,22 +7,22 @@ define dso_local i32 @test_500_504(ptr nocapture readonly %x) {
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    mov.w lr, #126
-; CHECK-NEXT:    adr r2, .LCPI0_0
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    mov.w r2, #500
-; CHECK-NEXT:    vdup.32 q1, r2
-; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    adr r1, .LCPI0_0
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    mov.w r1, #500
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    vdup.32 q1, r1
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vqadd.u32 q2, q0, r1
-; CHECK-NEXT:    adds r1, #4
+; CHECK-NEXT:    vqadd.u32 q2, q0, r2
+; CHECK-NEXT:    adds r2, #4
 ; CHECK-NEXT:    vptt.u32 hi, q1, q2
 ; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
-; CHECK-NEXT:    vaddvat.u32 r2, q2
+; CHECK-NEXT:    vaddvat.u32 r12, q2
 ; CHECK-NEXT:    le lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
index ec257bcf123f3..bcedcd40ba112 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll
@@ -28,29 +28,29 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap
 ; CHECK-NEXT:    str r6, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    subs r7, #4
 ; CHECK-NEXT:    movs r6, #1
-; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:    mov.w r8, #0
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
 ; CHECK-NEXT:  .LBB0_5: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r11, [r0, #16]!
-; CHECK-NEXT:    ldrd r5, r7, [r0, #-12]
+; CHECK-NEXT:    ldrd r5, r6, [r0, #-12]
 ; CHECK-NEXT:    ldr r4, [r0, #-4]
 ; CHECK-NEXT:    cmp r12, r5
 ; CHECK-NEXT:    csel r5, r5, r12, gt
-; CHECK-NEXT:    csinc r6, r10, r8, le
-; CHECK-NEXT:    cmp r5, r7
+; CHECK-NEXT:    csinc r7, r10, r8, le
+; CHECK-NEXT:    cmp r5, r6
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    addgt.w r6, r8, #2
-; CHECK-NEXT:    csel r7, r7, r5, gt
-; CHECK-NEXT:    cmp r7, r4
+; CHECK-NEXT:    addgt.w r7, r8, #2
+; CHECK-NEXT:    csel r6, r6, r5, gt
+; CHECK-NEXT:    cmp r6, r4
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    addgt.w r6, r8, #3
-; CHECK-NEXT:    csel r7, r4, r7, gt
+; CHECK-NEXT:    addgt.w r7, r8, #3
+; CHECK-NEXT:    csel r6, r4, r6, gt
 ; CHECK-NEXT:    add.w r8, r8, #4
-; CHECK-NEXT:    cmp r7, r11
-; CHECK-NEXT:    csel r10, r8, r6, gt
-; CHECK-NEXT:    csel r12, r11, r7, gt
+; CHECK-NEXT:    cmp r6, r11
+; CHECK-NEXT:    csel r10, r8, r7, gt
+; CHECK-NEXT:    csel r12, r11, r6, gt
 ; CHECK-NEXT:    le lr, .LBB0_5
 ; CHECK-NEXT:  @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit
 ; CHECK-NEXT:    ldr r6, [sp] @ 4-byte Reload
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 1769c5d2fd385..98e082be4cad1 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -21,11 +21,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; ENABLED-NEXT:    it lt
 ; ENABLED-NEXT:    bxlt lr
 ; ENABLED-NEXT:  .LBB0_1: @ %for.body.lr.ph
-; ENABLED-NEXT:    push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; ENABLED-NEXT:    mov r11, r0
-; ENABLED-NEXT:    ldr r0, [sp, #32]
+; ENABLED-NEXT:    ldr r0, [sp, #36]
 ; ENABLED-NEXT:    add.w r9, r2, #3
 ; ENABLED-NEXT:    mov.w r12, #0
+; ENABLED-NEXT:    mov.w r8, #1
 ; ENABLED-NEXT:    mov r10, r11
 ; ENABLED-NEXT:    uxth r0, r0
 ; ENABLED-NEXT:    rsbs r5, r0, #0
@@ -49,18 +50,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; ENABLED-NEXT:  @ %bb.5: @ %vector.ph
 ; ENABLED-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; ENABLED-NEXT:    bic r0, r9, #3
-; ENABLED-NEXT:    movs r7, #1
-; ENABLED-NEXT:    subs r0, #4
 ; ENABLED-NEXT:    sub.w r4, r2, r12
+; ENABLED-NEXT:    subs r0, #4
 ; ENABLED-NEXT:    vmov.i32 q1, #0x0
-; ENABLED-NEXT:    add.w r6, r7, r0, lsr #2
+; ENABLED-NEXT:    mov r7, r10
+; ENABLED-NEXT:    add.w r6, r8, r0, lsr #2
 ; ENABLED-NEXT:    adds r0, r2, #3
 ; ENABLED-NEXT:    sub.w r0, r0, r12
 ; ENABLED-NEXT:    bic r0, r0, #3
 ; ENABLED-NEXT:    subs r0, #4
-; ENABLED-NEXT:    add.w r0, r7, r0, lsr #2
-; ENABLED-NEXT:    mov r7, r10
-; ENABLED-NEXT:    dls lr, r0
+; ENABLED-NEXT:    add.w lr, r8, r0, lsr #2
 ; ENABLED-NEXT:    mov r0, r11
 ; ENABLED-NEXT:  .LBB0_6: @ %vector.body
 ; ENABLED-NEXT:    @ Parent Loop BB0_4 Depth=1
@@ -83,7 +82,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; ENABLED-NEXT:    vaddv.u32 r0, q0
 ; ENABLED-NEXT:    b .LBB0_3
 ; ENABLED-NEXT:  .LBB0_8:
-; ENABLED-NEXT:    pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; ENABLED-NEXT:    bx lr
 ;
 ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction:
@@ -92,11 +91,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; NOREDUCTIONS-NEXT:    it lt
 ; NOREDUCTIONS-NEXT:    bxlt lr
 ; NOREDUCTIONS-NEXT:  .LBB0_1: @ %for.body.lr.ph
-; NOREDUCTIONS-NEXT:    push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; NOREDUCTIONS-NEXT:    mov r11, r0
-; NOREDUCTIONS-NEXT:    ldr r0, [sp, #32]
+; NOREDUCTIONS-NEXT:    ldr r0, [sp, #36]
 ; NOREDUCTIONS-NEXT:    add.w r9, r2, #3
 ; NOREDUCTIONS-NEXT:    mov.w r12, #0
+; NOREDUCTIONS-NEXT:    mov.w r8, #1
 ; NOREDUCTIONS-NEXT:    mov r10, r11
 ; NOREDUCTIONS-NEXT:    uxth r0, r0
 ; NOREDUCTIONS-NEXT:    rsbs r5, r0, #0
@@ -120,18 +120,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; NOREDUCTIONS-NEXT:  @ %bb.5: @ %vector.ph
 ; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    bic r0, r9, #3
-; NOREDUCTIONS-NEXT:    movs r7, #1
-; NOREDUCTIONS-NEXT:    subs r0, #4
 ; NOREDUCTIONS-NEXT:    sub.w r4, r2, r12
+; NOREDUCTIONS-NEXT:    subs r0, #4
 ; NOREDUCTIONS-NEXT:    vmov.i32 q1, #0x0
-; NOREDUCTIONS-NEXT:    add.w r6, r7, r0, lsr #2
+; NOREDUCTIONS-NEXT:    mov r7, r10
+; NOREDUCTIONS-NEXT:    add.w r6, r8, r0, lsr #2
 ; NOREDUCTIONS-NEXT:    adds r0, r2, #3
 ; NOREDUCTIONS-NEXT:    sub.w r0, r0, r12
 ; NOREDUCTIONS-NEXT:    bic r0, r0, #3
 ; NOREDUCTIONS-NEXT:    subs r0, #4
-; NOREDUCTIONS-NEXT:    add.w r0, r7, r0, lsr #2
-; NOREDUCTIONS-NEXT:    mov r7, r10
-; NOREDUCTIONS-NEXT:    dls lr, r0
+; NOREDUCTIONS-NEXT:    add.w lr, r8, r0, lsr #2
 ; NOREDUCTIONS-NEXT:    mov r0, r11
 ; NOREDUCTIONS-NEXT:  .LBB0_6: @ %vector.body
 ; NOREDUCTIONS-NEXT:    @ Parent Loop BB0_4 Depth=1
@@ -154,7 +152,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; NOREDUCTIONS-NEXT:    vaddv.u32 r0, q0
 ; NOREDUCTIONS-NEXT:    b .LBB0_3
 ; NOREDUCTIONS-NEXT:  .LBB0_8:
-; NOREDUCTIONS-NEXT:    pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; NOREDUCTIONS-NEXT:    bx lr
 entry:
   %conv = sext i16 %N to i32
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
index cbcbf1f392ce8..435acc29f076e 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
@@ -165,74 +165,73 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" {
 ; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    wls lr, r1, .LBB2_3
 ; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT:    adds r6, r3, #4
-; CHECK-NEXT:    adds r1, r0, #4
+; CHECK-NEXT:    add.w r9, r3, #4
+; CHECK-NEXT:    add.w r10, r0, #4
 ; CHECK-NEXT:    mvn r8, #1
-; CHECK-NEXT:    @ implicit-def: $r9
+; CHECK-NEXT:    @ implicit-def: $r6
 ; CHECK-NEXT:    @ implicit-def: $r4
 ; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB2_2: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    ldr.w r1, [r10]
 ; CHECK-NEXT:    asrs r2, r4, #31
-; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    ldr r1, [r1]
+; CHECK-NEXT:    str r6, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    muls r1, r3, r1
 ; CHECK-NEXT:    adds r4, r4, r1
 ; CHECK-NEXT:    adc.w r1, r2, r1, asr #31
 ; CHECK-NEXT:    adds.w r2, r4, #-2147483648
-; CHECK-NEXT:    ldrd r2, r4, [r8]
-; CHECK-NEXT:    adc r5, r1, #0
-; CHECK-NEXT:    str r2, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    smull r4, r2, r4, r9
-; CHECK-NEXT:    asrs r1, r5, #31
+; CHECK-NEXT:    ldrd r5, r4, [r8]
+; CHECK-NEXT:    adc r2, r1, #0
 ; CHECK-NEXT:    str r5, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    subs r4, r5, r4
-; CHECK-NEXT:    sbcs r1, r2
-; CHECK-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    adds.w r10, r4, #-2147483648
-; CHECK-NEXT:    adc r1, r1, #0
-; CHECK-NEXT:    ldr r4, [r2, #-4]
+; CHECK-NEXT:    smull r4, r5, r4, r6
+; CHECK-NEXT:    asrs r1, r2, #31
+; CHECK-NEXT:    str r2, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    subs r4, r2, r4
+; CHECK-NEXT:    sbcs r1, r5
+; CHECK-NEXT:    adds.w r6, r4, #-2147483648
+; CHECK-NEXT:    ldr r4, [r10, #-4]
+; CHECK-NEXT:    adc r11, r1, #0
+; CHECK-NEXT:    mov r1, r9
+; CHECK-NEXT:    add.w r10, r10, #4
 ; CHECK-NEXT:    muls r4, r3, r4
 ; CHECK-NEXT:    adds r3, #4
 ; CHECK-NEXT:    adds.w r12, r4, #-2147483648
 ; CHECK-NEXT:    asr.w r5, r4, #31
-; CHECK-NEXT:    ldr r4, [r6]
+; CHECK-NEXT:    ldr.w r4, [r9]
 ; CHECK-NEXT:    adc r5, r5, #0
 ; CHECK-NEXT:    mul r2, r4, r0
-; CHECK-NEXT:    adds r0, #4
 ; CHECK-NEXT:    add.w r2, r2, #-2147483648
 ; CHECK-NEXT:    asrl r12, r5, r2
-; CHECK-NEXT:    smull r2, r5, r4, r12
-; CHECK-NEXT:    lsll r2, r5, #30
-; CHECK-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    asr.w r11, r5, #31
-; CHECK-NEXT:    mov r12, r5
-; CHECK-NEXT:    lsll r12, r11, r4
-; CHECK-NEXT:    mul r2, r2, r9
-; CHECK-NEXT:    lsrl r12, r11, #2
-; CHECK-NEXT:    adds r2, #2
-; CHECK-NEXT:    lsll r12, r11, r2
+; CHECK-NEXT:    smull r2, r9, r4, r12
+; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:    lsll r2, r9, #30
+; CHECK-NEXT:    asr.w r5, r9, #31
+; CHECK-NEXT:    mov r2, r9
+; CHECK-NEXT:    mov r9, r1
+; CHECK-NEXT:    ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload
+; CHECK-NEXT:    lsll r2, r5, r4
+; CHECK-NEXT:    lsrl r2, r5, #2
+; CHECK-NEXT:    muls r0, r1, r0
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    adds r0, #2
+; CHECK-NEXT:    lsll r2, r5, r0
+; CHECK-NEXT:    add.w r0, r2, #-2147483648
 ; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
-; CHECK-NEXT:    add.w r5, r12, #-2147483648
-; CHECK-NEXT:    asrl r10, r1, r5
-; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    lsrl r10, r1, #2
-; CHECK-NEXT:    movs r1, #2
-; CHECK-NEXT:    mov r9, r10
-; CHECK-NEXT:    str.w r10, [r1]
-; CHECK-NEXT:    ldr r1, [r8], #-4
-; CHECK-NEXT:    mls r5, r1, r4, r5
-; CHECK-NEXT:    adds.w r4, r5, #-2147483648
-; CHECK-NEXT:    asr.w r1, r5, #31
+; CHECK-NEXT:    asrl r6, r11, r0
+; CHECK-NEXT:    movs r0, #2
+; CHECK-NEXT:    lsrl r6, r11, #2
+; CHECK-NEXT:    str r6, [r0]
+; CHECK-NEXT:    ldr r0, [r8], #-4
+; CHECK-NEXT:    mls r0, r0, r4, r1
+; CHECK-NEXT:    adds.w r4, r0, #-2147483648
+; CHECK-NEXT:    asr.w r1, r0, #31
 ; CHECK-NEXT:    adc r1, r1, #0
 ; CHECK-NEXT:    lsrl r4, r1, #2
-; CHECK-NEXT:    rsbs r1, r4, #0
-; CHECK-NEXT:    str r1, [r2]
-; CHECK-NEXT:    str r1, [r6, #-4]
-; CHECK-NEXT:    adds r6, #4
-; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    adds r1, #4
+; CHECK-NEXT:    rsbs r0, r4, #0
+; CHECK-NEXT:    str r0, [r2]
+; CHECK-NEXT:    str r0, [r9, #-4]
+; CHECK-NEXT:    add.w r9, r9, #4
+; CHECK-NEXT:    add.w r0, r12, #4
 ; CHECK-NEXT:    le lr, .LBB2_2
 ; CHECK-NEXT:  .LBB2_3: @ %while.end
 ; CHECK-NEXT:    add sp, #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index f7b4548f127bf..b6657d607ce6d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1573,120 +1573,115 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    ldrd r7, r9, [r0]
-; CHECK-NEXT:    and r6, r3, #3
-; CHECK-NEXT:    ldr r0, [r0, #8]
-; CHECK-NEXT:    lsrs r3, r3, #2
-; CHECK-NEXT:    @ implicit-def: $r12
-; CHECK-NEXT:    str r6, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
-; CHECK-NEXT:    str r2, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    ldm.w r0, {r7, r9, r11}
+; CHECK-NEXT:    and r0, r3, #3
+; CHECK-NEXT:    @ implicit-def: $r5
+; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    lsrs r0, r3, #2
+; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB19_3
 ; CHECK-NEXT:  .LBB19_1: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    mov r3, r8
-; CHECK-NEXT:    mov r2, r5
-; CHECK-NEXT:    mov r4, r11
-; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:    mov r8, r3
+; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    mov r12, r10
 ; CHECK-NEXT:  .LBB19_2: @ %if.end69
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    adds r0, #128
-; CHECK-NEXT:    strd r2, r4, [r9]
-; CHECK-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    subs r7, #1
-; CHECK-NEXT:    strd r3, r8, [r9, #8]
-; CHECK-NEXT:    add.w r9, r9, #16
+; CHECK-NEXT:    add.w r11, r11, #128
+; CHECK-NEXT:    strd r8, r0, [r9]
 ; CHECK-NEXT:    mov r1, r2
+; CHECK-NEXT:    strd r3, r12, [r9, #8]
+; CHECK-NEXT:    add.w r9, r9, #16
+; CHECK-NEXT:    subs r7, #1
 ; CHECK-NEXT:    beq.w .LBB19_13
 ; CHECK-NEXT:  .LBB19_3: @ %do.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB19_5 Depth 2
-; CHECK-NEXT:    ldrd r5, r11, [r9]
+; CHECK-NEXT:    ldr.w r10, [r9, #12]
 ; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    ldrd r8, r10, [r9, #8]
-; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT:    ldm.w r9, {r3, r4, r12}
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    str r7, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    wls lr, r2, .LBB19_6
+; CHECK-NEXT:    wls lr, r0, .LBB19_6
 ; CHECK-NEXT:  @ %bb.4: @ %while.body.lr.ph
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    mov r4, r11
-; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:  .LBB19_5: @ %while.body
 ; CHECK-NEXT:    @ Parent Loop BB19_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldr r5, [r1, #12]
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
-; CHECK-NEXT:    ldm.w r1, {r2, r7, r11}
-; CHECK-NEXT:    vmul.f32 q2, q2, r5
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
-; CHECK-NEXT:    vfma.f32 q2, q6, r11
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    mov r8, r4
+; CHECK-NEXT:    ldrd r4, r3, [r1, #8]
+; CHECK-NEXT:    vldrw.u32 q2, [r11]
+; CHECK-NEXT:    vldrw.u32 q6, [r11, #16]
+; CHECK-NEXT:    ldrd r0, r7, [r1]
+; CHECK-NEXT:    vmul.f32 q2, q2, r3
+; CHECK-NEXT:    vldrw.u32 q7, [r11, #32]
+; CHECK-NEXT:    vfma.f32 q2, q6, r4
+; CHECK-NEXT:    vldrw.u32 q4, [r11, #48]
 ; CHECK-NEXT:    vfma.f32 q2, q7, r7
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #64]
-; CHECK-NEXT:    vfma.f32 q2, q4, r2
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT:    vfma.f32 q2, q5, r3
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
-; CHECK-NEXT:    vfma.f32 q2, q3, r4
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
-; CHECK-NEXT:    vfma.f32 q2, q1, r8
+; CHECK-NEXT:    vldrw.u32 q5, [r11, #64]
+; CHECK-NEXT:    vfma.f32 q2, q4, r0
+; CHECK-NEXT:    vldrw.u32 q3, [r11, #80]
+; CHECK-NEXT:    vfma.f32 q2, q5, r5
+; CHECK-NEXT:    vldrw.u32 q1, [r11, #96]
+; CHECK-NEXT:    vfma.f32 q2, q3, r8
+; CHECK-NEXT:    vldrw.u32 q0, [r11, #112]
+; CHECK-NEXT:    vfma.f32 q2, q1, r12
 ; CHECK-NEXT:    adds r1, #16
 ; CHECK-NEXT:    vfma.f32 q2, q0, r10
-; CHECK-NEXT:    mov r4, r11
-; CHECK-NEXT:    vmov r10, r8, d5
+; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    vmov r10, r12, d5
 ; CHECK-NEXT:    vstrb.8 q2, [r6], #16
-; CHECK-NEXT:    mov r3, r5
-; CHECK-NEXT:    mov r12, r5
 ; CHECK-NEXT:    le lr, .LBB19_5
 ; CHECK-NEXT:  .LBB19_6: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    ldr r3, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    cmp r3, #0
+; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    cmp r7, #0
 ; CHECK-NEXT:    beq .LBB19_1
 ; CHECK-NEXT:  @ %bb.7: @ %if.then
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    ldrd lr, r4, [r1]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    ldrd r2, r1, [r1, #8]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    ldrd lr, r0, [r1]
+; CHECK-NEXT:    vldrw.u32 q0, [r11]
+; CHECK-NEXT:    ldrd r8, r1, [r1, #8]
+; CHECK-NEXT:    vldrw.u32 q6, [r11, #16]
+; CHECK-NEXT:    vldrw.u32 q7, [r11, #32]
+; CHECK-NEXT:    vldrw.u32 q4, [r11, #48]
 ; CHECK-NEXT:    vmul.f32 q0, q0, r1
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #64]
-; CHECK-NEXT:    vfma.f32 q0, q6, r2
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT:    vfma.f32 q0, q7, r4
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q5, [r11, #64]
+; CHECK-NEXT:    vfma.f32 q0, q6, r8
+; CHECK-NEXT:    vldrw.u32 q3, [r11, #80]
+; CHECK-NEXT:    vfma.f32 q0, q7, r0
+; CHECK-NEXT:    vldrw.u32 q2, [r11, #96]
 ; CHECK-NEXT:    vfma.f32 q0, q4, lr
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
-; CHECK-NEXT:    vfma.f32 q0, q5, r5
-; CHECK-NEXT:    cmp r3, #1
-; CHECK-NEXT:    vfma.f32 q0, q3, r11
-; CHECK-NEXT:    vfma.f32 q0, q2, r8
+; CHECK-NEXT:    vldrw.u32 q1, [r11, #112]
+; CHECK-NEXT:    vfma.f32 q0, q5, r3
+; CHECK-NEXT:    cmp r7, #1
+; CHECK-NEXT:    vfma.f32 q0, q3, r4
+; CHECK-NEXT:    vfma.f32 q0, q2, r12
 ; CHECK-NEXT:    vfma.f32 q0, q1, r10
-; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    bne .LBB19_9
 ; CHECK-NEXT:  @ %bb.8: @ %if.then58
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    str r5, [r6]
-; CHECK-NEXT:    mov r2, lr
-; CHECK-NEXT:    mov r4, r12
-; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    str r4, [r6]
+; CHECK-NEXT:    mov r8, lr
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r3, r4
 ; CHECK-NEXT:    b .LBB19_12
 ; CHECK-NEXT:  .LBB19_9: @ %if.else
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    vmov r8, s1
-; CHECK-NEXT:    cmp r3, #2
+; CHECK-NEXT:    vmov r12, s1
+; CHECK-NEXT:    cmp r7, #2
 ; CHECK-NEXT:    vstr s1, [r6, #4]
-; CHECK-NEXT:    str r5, [r6]
+; CHECK-NEXT:    str r4, [r6]
 ; CHECK-NEXT:    bne .LBB19_11
 ; CHECK-NEXT:  @ %bb.10: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    mov r2, r4
-; CHECK-NEXT:    mov r3, r8
-; CHECK-NEXT:    mov r4, lr
-; CHECK-NEXT:    mov r8, r5
+; CHECK-NEXT:    mov r8, r0
+; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    mov r0, lr
+; CHECK-NEXT:    mov r12, r4
 ; CHECK-NEXT:    b .LBB19_12
 ; CHECK-NEXT:  .LBB19_11: @ %if.else64
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
@@ -1694,7 +1689,7 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:    vstr s2, [r6, #8]
 ; CHECK-NEXT:  .LBB19_12: @ %if.end69
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    mov r12, r1
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    b .LBB19_2
 ; CHECK-NEXT:  .LBB19_13: @ %do.end
 ; CHECK-NEXT:    add sp, #16
@@ -1901,8 +1896,8 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    ldrd r6, r12, [r0, #4]
 ; CHECK-NEXT:    lsr.w r8, r3, #1
 ; CHECK-NEXT:    ldrb r0, [r0]
@@ -1910,11 +1905,11 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur
 ; CHECK-NEXT:    b .LBB20_3
 ; CHECK-NEXT:  .LBB20_1: @ %if.else
 ; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
-; CHECK-NEXT:    vmov.f32 s14, s13
-; CHECK-NEXT:    vstr s12, [r6]
+; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vstr s4, [r6]
 ; CHECK-NEXT:  .LBB20_2: @ %if.end
 ; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
-; CHECK-NEXT:    vstr s14, [r6, #4]
+; CHECK-NEXT:    vstr s6, [r6, #4]
 ; CHECK-NEXT:    add.w r12, r12, #20
 ; CHECK-NEXT:    adds r6, #8
 ; CHECK-NEXT:    subs r0, #1
@@ -1923,41 +1918,39 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur
 ; CHECK-NEXT:  .LBB20_3: @ %do.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB20_5 Depth 2
-; CHECK-NEXT:    vldrw.u32 q2, [r12]
+; CHECK-NEXT:    vldrw.u32 q3, [r12]
 ; CHECK-NEXT:    movs r5, #0
-; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vshlc q4, r5, #32
-; CHECK-NEXT:    vldrw.u32 q1, [r12, #8]
-; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vldrw.u32 q2, [r12, #8]
+; CHECK-NEXT:    vmov q5, q2
 ; CHECK-NEXT:    vshlc q5, r5, #32
-; CHECK-NEXT:    vldrw.u32 q3, [r6]
-; CHECK-NEXT:    vmov.f32 s14, s0
+; CHECK-NEXT:    vldrw.u32 q1, [r6]
+; CHECK-NEXT:    vmov.f32 s6, s0
 ; CHECK-NEXT:    mov r5, r2
-; CHECK-NEXT:    vmov.f32 s15, s0
+; CHECK-NEXT:    vmov.f32 s7, s0
 ; CHECK-NEXT:    wls lr, r8, .LBB20_6
 ; CHECK-NEXT:  @ %bb.4: @ %while.body.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
-; CHECK-NEXT:    vmov q6, q3
 ; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:  .LBB20_5: @ %while.body
 ; CHECK-NEXT:    @ Parent Loop BB20_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrd r7, r4, [r1], #8
-; CHECK-NEXT:    vfma.f32 q6, q2, r7
-; CHECK-NEXT:    vmov r7, s24
-; CHECK-NEXT:    vmov q3, q6
-; CHECK-NEXT:    vfma.f32 q3, q1, r7
-; CHECK-NEXT:    vstr s24, [r5]
-; CHECK-NEXT:    vmov.f32 s15, s0
-; CHECK-NEXT:    vfma.f32 q3, q4, r4
-; CHECK-NEXT:    vmov r4, s13
-; CHECK-NEXT:    vstr s13, [r5, #4]
-; CHECK-NEXT:    vfma.f32 q3, q5, r4
+; CHECK-NEXT:    vfma.f32 q1, q3, r7
+; CHECK-NEXT:    vmov r7, s4
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vfma.f32 q1, q2, r7
+; CHECK-NEXT:    vmov.f32 s7, s0
+; CHECK-NEXT:    vfma.f32 q1, q4, r4
+; CHECK-NEXT:    vmov r4, s5
+; CHECK-NEXT:    vstr s5, [r5, #4]
+; CHECK-NEXT:    vfma.f32 q1, q5, r4
+; CHECK-NEXT:    vmov.f32 s4, s6
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vstr s2, [r5]
 ; CHECK-NEXT:    adds r5, #8
-; CHECK-NEXT:    vmov.f32 s12, s14
-; CHECK-NEXT:    vmov.f32 s13, s15
-; CHECK-NEXT:    vmov.f32 s14, s0
-; CHECK-NEXT:    vmov q6, q3
 ; CHECK-NEXT:    le lr, .LBB20_5
 ; CHECK-NEXT:  .LBB20_6: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
@@ -1966,14 +1959,14 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur
 ; CHECK-NEXT:  @ %bb.7: @ %if.then
 ; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
 ; CHECK-NEXT:    ldr r1, [r1]
-; CHECK-NEXT:    vfma.f32 q3, q2, r1
-; CHECK-NEXT:    vmov r1, s12
-; CHECK-NEXT:    vstr s12, [r5]
-; CHECK-NEXT:    vfma.f32 q3, q1, r1
-; CHECK-NEXT:    vstr s13, [r6]
+; CHECK-NEXT:    vfma.f32 q1, q3, r1
+; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vstr s4, [r5]
+; CHECK-NEXT:    vfma.f32 q1, q2, r1
+; CHECK-NEXT:    vstr s5, [r6]
 ; CHECK-NEXT:    b .LBB20_2
 ; CHECK-NEXT:  .LBB20_8: @ %do.end
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  @ %bb.9:
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index 0d86f22a321e0..b60ee7c6d406b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -1313,27 +1313,29 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    @ Child Loop BB16_3 Depth 2
 ; CHECK-NEXT:    ldr.w r8, [sp, #56] @ 4-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:  .LBB16_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB16_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vadd.i32 q1, q5, r0
+; CHECK-NEXT:    vmov q0, q6
+; CHECK-NEXT:    vadd.i32 q6, q5, r0
+; CHECK-NEXT:    vmov r7, r3, d13
 ; CHECK-NEXT:    vadd.i32 q2, q4, r0
-; CHECK-NEXT:    vmov r7, r3, d3
-; CHECK-NEXT:    vadd.i32 q6, q0, lr
 ; CHECK-NEXT:    vmov r5, r6, d5
+; CHECK-NEXT:    vmov q1, q7
+; CHECK-NEXT:    vmov r4, r10, d12
+; CHECK-NEXT:    vadd.i32 q6, q0, lr
 ; CHECK-NEXT:    subs.w r9, r9, #16
-; CHECK-NEXT:    vmov r4, r10, d2
-; CHECK-NEXT:    vadd.i32 q1, q7, lr
 ; CHECK-NEXT:    vadd.i32 q4, q4, lr
 ; CHECK-NEXT:    vadd.i32 q5, q5, lr
+; CHECK-NEXT:    vadd.i32 q7, q7, lr
 ; CHECK-NEXT:    ldrb.w r11, [r3]
 ; CHECK-NEXT:    ldrb r3, [r7]
 ; CHECK-NEXT:    vmov r7, r12, d4
-; CHECK-NEXT:    vadd.i32 q2, q7, r0
-; CHECK-NEXT:    vadd.i32 q7, q0, r0
+; CHECK-NEXT:    vadd.i32 q2, q1, r0
+; CHECK-NEXT:    vadd.i32 q1, q0, r0
 ; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    ldrb r6, [r6]
 ; CHECK-NEXT:    ldrb r4, [r4]
@@ -1342,7 +1344,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    ldrb.w r1, [r12]
 ; CHECK-NEXT:    vmov.8 q0[0], r7
 ; CHECK-NEXT:    vmov.8 q0[1], r1
-; CHECK-NEXT:    vmov r1, r7, d15
+; CHECK-NEXT:    vmov r1, r7, d3
 ; CHECK-NEXT:    vmov.8 q0[2], r5
 ; CHECK-NEXT:    vmov.8 q0[3], r6
 ; CHECK-NEXT:    vmov.8 q0[4], r4
@@ -1357,8 +1359,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    ldrb r3, [r5]
 ; CHECK-NEXT:    ldrb.w r12, [r7]
 ; CHECK-NEXT:    ldrb r5, [r4]
-; CHECK-NEXT:    vmov r4, r7, d14
-; CHECK-NEXT:    vmov q7, q1
+; CHECK-NEXT:    vmov r4, r7, d2
 ; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r7, [r7]
 ; CHECK-NEXT:    vmov.8 q0[8], r4
@@ -1370,7 +1371,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT:    vmov.8 q0[14], r3
 ; CHECK-NEXT:    vmov.8 q0[15], r12
 ; CHECK-NEXT:    vstrb.8 q0, [r8], #16
-; CHECK-NEXT:    vmov q0, q6
 ; CHECK-NEXT:    bne .LBB16_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB16_2 Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index eedca2cd4a5d3..c0b2da7eff41b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -236,11 +236,11 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(ptr noalias nocapture r
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:  .LBB5_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, q1, uxtw #2]
-; CHECK-NEXT:    vadd.i32 q3, q1, q0
+; CHECK-NEXT:    vldrw.u32 q3, [r0, q1, uxtw #2]
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vstrw.32 q2, [r0, q1, uxtw #2]
-; CHECK-NEXT:    vmov q1, q3
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vadd.i32 q1, q1, q0
+; CHECK-NEXT:    vstrw.32 q3, [r0, q2, uxtw #2]
 ; CHECK-NEXT:    bne .LBB5_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
@@ -330,20 +330,20 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(ptr noalias nocapture readonly %da
 ; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    adr r4, .LCPI7_0
 ; CHECK-NEXT:    mov.w r12, #9
-; CHECK-NEXT:    vldrw.u32 q1, [r4]
+; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    mov.w lr, #12
 ; CHECK-NEXT:    movs r4, #8
-; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vdup.32 q1, r0
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vadd.i32 q2, q1, r4
-; CHECK-NEXT:    vmla.i32 q3, q1, lr
-; CHECK-NEXT:    vmul.i32 q1, q1, r12
-; CHECK-NEXT:    vldrw.u32 q4, [q3, #24]
+; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vmla.i32 q3, q2, lr
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vstrw.32 q1, [r3]
-; CHECK-NEXT:    vmov q1, q2
+; CHECK-NEXT:    vldrw.u32 q4, [q3, #24]
+; CHECK-NEXT:    vmul.i32 q2, q2, r12
+; CHECK-NEXT:    vadd.i32 q0, q0, r4
+; CHECK-NEXT:    vstrw.32 q2, [r3]
 ; CHECK-NEXT:    vstrb.8 q4, [r1], #16
 ; CHECK-NEXT:    bne .LBB7_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
@@ -390,22 +390,22 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(ptr noalias nocapture readonly %da
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    adr r4, .LCPI8_0
 ; CHECK-NEXT:    movs r5, #18
-; CHECK-NEXT:    vldrw.u32 q2, [r4]
+; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    mov.w r12, #9
 ; CHECK-NEXT:    mov.w lr, #12
 ; CHECK-NEXT:    movs r4, #8
-; CHECK-NEXT:    vdup.32 q0, r0
-; CHECK-NEXT:    vdup.32 q1, r5
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vdup.32 q2, r5
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vadd.i32 q3, q2, r4
-; CHECK-NEXT:    vmla.i32 q4, q2, lr
+; CHECK-NEXT:    vmov q3, q0
+; CHECK-NEXT:    vmov q4, q1
+; CHECK-NEXT:    vmla.i32 q4, q3, lr
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vldrw.u32 q5, [q4, #24]
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vmla.i32 q4, q2, r12
-; CHECK-NEXT:    vmov q2, q3
+; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vmla.i32 q4, q3, r12
+; CHECK-NEXT:    vadd.i32 q0, q0, r4
 ; CHECK-NEXT:    vstrb.8 q5, [r1], #16
 ; CHECK-NEXT:    vstrw.32 q4, [r3]
 ; CHECK-NEXT:    bne .LBB8_1
@@ -487,21 +487,21 @@ define dso_local void @arm_mat_mult_q31(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    @ => This Loop Header: Depth=2
 ; CHECK-NEXT:    @ Child Loop BB9_3 Depth 3
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov q7, q2
+; CHECK-NEXT:    vmov q1, q2
 ; CHECK-NEXT:    dls lr, r10
 ; CHECK-NEXT:    vmov.i32 q5, #0x0
-; CHECK-NEXT:    vmlas.i32 q7, q0, r7
-; CHECK-NEXT:    vmov q6, q4
+; CHECK-NEXT:    vmlas.i32 q1, q0, r7
+; CHECK-NEXT:    vmov q7, q4
 ; CHECK-NEXT:  .LBB9_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB9_1 Depth=1
 ; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT:    vadd.i32 q0, q7, q3
-; CHECK-NEXT:    vldrw.u32 q1, [r1, q7, uxtw #2]
-; CHECK-NEXT:    vldrw.u32 q7, [q6, #32]!
-; CHECK-NEXT:    vmul.i32 q1, q1, q7
-; CHECK-NEXT:    vmov q7, q0
-; CHECK-NEXT:    vadd.i32 q5, q1, q5
+; CHECK-NEXT:    vmov q6, q1
+; CHECK-NEXT:    vadd.i32 q1, q1, q3
+; CHECK-NEXT:    vldrw.u32 q0, [r1, q6, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q6, [q7, #32]!
+; CHECK-NEXT:    vmul.i32 q0, q0, q6
+; CHECK-NEXT:    vadd.i32 q5, q0, q5
 ; CHECK-NEXT:    le lr, .LBB9_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB9_2 Depth=2
@@ -702,12 +702,12 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
 ; CHECK-NEXT:    @ Parent Loop BB10_8 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT:    vadd.i32 q6, q5, q3
-; CHECK-NEXT:    vldrh.s32 q7, [r1, q5, uxtw #1]
-; CHECK-NEXT:    vldrh.s32 q5, [r3], #8
-; CHECK-NEXT:    vmul.i32 q5, q7, q5
-; CHECK-NEXT:    vadd.i32 q4, q5, q4
-; CHECK-NEXT:    vmov q5, q6
+; CHECK-NEXT:    vmov q6, q5
+; CHECK-NEXT:    vadd.i32 q5, q5, q3
+; CHECK-NEXT:    vldrh.s32 q7, [r1, q6, uxtw #1]
+; CHECK-NEXT:    vldrh.s32 q6, [r3], #8
+; CHECK-NEXT:    vmul.i32 q6, q7, q6
+; CHECK-NEXT:    vadd.i32 q4, q6, q4
 ; CHECK-NEXT:    le lr, .LBB10_11
 ; CHECK-NEXT:  @ %bb.12: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
@@ -922,15 +922,15 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(ptr nocapture readonly
 ; CHECK-NEXT:    @ Parent Loop BB11_3 Depth=3
 ; CHECK-NEXT:    @ Parent Loop BB11_4 Depth=4
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=5
-; CHECK-NEXT:    vldrb.s32 q2, [r0, q5]
-; CHECK-NEXT:    vadd.i32 q7, q5, q0
-; CHECK-NEXT:    vldrb.s32 q5, [r1, q4]
-; CHECK-NEXT:    vadd.i32 q6, q4, q0
-; CHECK-NEXT:    vadd.i32 q2, q2, r2
+; CHECK-NEXT:    vmov q7, q5
+; CHECK-NEXT:    vmov q6, q4
+; CHECK-NEXT:    vldrb.s32 q2, [r0, q7]
+; CHECK-NEXT:    vldrb.s32 q7, [r1, q6]
 ; CHECK-NEXT:    subs r5, #4
-; CHECK-NEXT:    vmlava.u32 r12, q2, q5
-; CHECK-NEXT:    vmov q5, q7
-; CHECK-NEXT:    vmov q4, q6
+; CHECK-NEXT:    vadd.i32 q4, q4, q0
+; CHECK-NEXT:    vadd.i32 q2, q2, r2
+; CHECK-NEXT:    vadd.i32 q5, q5, q0
+; CHECK-NEXT:    vmlava.u32 r12, q2, q7
 ; CHECK-NEXT:    bne .LBB11_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB11_4 Depth=4
diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
index 43ed5eefbf4c7..d6c5cde30ed73 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
@@ -18,50 +18,50 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32
 ; CHECK-NEXT:    csel r7, r6, r5, hs
 ; CHECK-NEXT:    add.w lr, r7, #1
 ; CHECK-NEXT:    mov r4, r5
-; CHECK-NEXT:    vldrh.u16 q0, [r0], #32
+; CHECK-NEXT:    vldrh.u16 q1, [r0], #32
 ; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    mov r8, r5
+; CHECK-NEXT:    vldrh.u16 q2, [r1], #32
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q1, q2
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #-16]
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q1, q2
+; CHECK-NEXT:    vldrh.u16 q2, [r1, #-16]
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q0, q2
 ; CHECK-NEXT:    vldrh.u16 q1, [r1], #32
-; CHECK-NEXT:    vmlsldava.s16 r4, r7, q0, q1
-; CHECK-NEXT:    vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q0, q1
-; CHECK-NEXT:    vldrh.u16 q3, [r1, #-16]
-; CHECK-NEXT:    vmlsldava.s16 r4, r7, q2, q3
-; CHECK-NEXT:    vldrh.u16 q0, [r1], #32
 ; CHECK-NEXT:    sub.w lr, lr, #1
 ; CHECK-NEXT:    cmp.w lr, #0
-; CHECK-NEXT:    vldrh.u16 q1, [r0], #32
+; CHECK-NEXT:    vldrh.u16 q3, [r0], #32
 ; CHECK-NEXT:    beq .LBB0_3
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  .LBB0_2: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q2, q3
-; CHECK-NEXT:    vldrh.u16 q3, [r1, #-16]
-; CHECK-NEXT:    vmlsldava.s16 r4, r7, q1, q0
-; CHECK-NEXT:    vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q1, q0
-; CHECK-NEXT:    vldrh.u16 q1, [r0], #32
-; CHECK-NEXT:    vmlsldava.s16 r4, r7, q2, q3
-; CHECK-NEXT:    vldrh.u16 q0, [r1], #32
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q0, q2
+; CHECK-NEXT:    vldrh.u16 q2, [r1, #-16]
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q3, q1
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #-16]
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q3, q1
+; CHECK-NEXT:    vldrh.u16 q3, [r0], #32
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q0, q2
+; CHECK-NEXT:    vldrh.u16 q1, [r1], #32
 ; CHECK-NEXT:    le lr, .LBB0_2
 ; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q2, q3
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q0, q2
 ; CHECK-NEXT:    movs r6, #14
 ; CHECK-NEXT:    and.w r2, r6, r2, lsl #1
-; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q1, q0
-; CHECK-NEXT:    vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT:    vmlsldava.s16 r4, r7, q1, q0
-; CHECK-NEXT:    vldrh.u16 q0, [r1, #-16]
-; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q2, q0
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q3, q1
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #-16]
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q3, q1
+; CHECK-NEXT:    vldrh.u16 q1, [r1, #-16]
+; CHECK-NEXT:    vmlaldavax.s16 r8, r5, q0, q1
 ; CHECK-NEXT:    vctp.16 r2
-; CHECK-NEXT:    vmlsldava.s16 r4, r7, q2, q0
+; CHECK-NEXT:    vmlsldava.s16 r4, r7, q0, q1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrht.u16 q1, [r0]
+; CHECK-NEXT:    vldrht.u16 q2, [r0]
 ; CHECK-NEXT:    cmp r2, #9
 ; CHECK-NEXT:    vpsttt
 ; CHECK-NEXT:    vldrht.u16 q0, [r1]
-; CHECK-NEXT:    vmlsldavat.s16 r4, r7, q1, q0
-; CHECK-NEXT:    vmlaldavaxt.s16 r8, r5, q1, q0
+; CHECK-NEXT:    vmlsldavat.s16 r4, r7, q2, q0
+; CHECK-NEXT:    vmlaldavaxt.s16 r8, r5, q2, q0
 ; CHECK-NEXT:    blo .LBB0_10
 ; CHECK-NEXT:  @ %bb.4: @ %do.body.1
 ; CHECK-NEXT:    subs r2, #8
diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index 94d5490cead2f..6f2a0b2debc47 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -439,17 +439,18 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) {
 ; CHECK-NEXT:    vmovx.f16 s1, s14
 ; CHECK-NEXT:    vmovx.f16 s20, s0
 ; CHECK-NEXT:    vins.f16 s23, s1
-; CHECK-NEXT:    vmovx.f16 s1, s2
-; CHECK-NEXT:    vins.f16 s20, s1
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmovx.f16 s2, s2
 ; CHECK-NEXT:    vmovx.f16 s21, s4
-; CHECK-NEXT:    vmovx.f16 s1, s6
+; CHECK-NEXT:    vins.f16 s20, s2
+; CHECK-NEXT:    vmovx.f16 s2, s6
 ; CHECK-NEXT:    vins.f16 s12, s14
 ; CHECK-NEXT:    vins.f16 s8, s10
 ; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vins.f16 s21, s1
-; CHECK-NEXT:    vins.f16 s0, s2
-; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vins.f16 s21, s2
+; CHECK-NEXT:    vins.f16 s0, s1
 ; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vmov.f32 s1, s4
 ; CHECK-NEXT:    vmov.f32 s3, s12
 ; CHECK-NEXT:    vadd.i16 q0, q0, q5
 ; CHECK-NEXT:    vadd.i16 q0, q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index ab41069bfa258..ecb169898f9f0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -391,17 +391,18 @@ define void @vld4_v8i16_align1(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vmovx.f16 s1, s2
 ; CHECK-NEXT:    vmovx.f16 s20, s8
 ; CHECK-NEXT:    vins.f16 s23, s1
-; CHECK-NEXT:    vmovx.f16 s1, s10
-; CHECK-NEXT:    vins.f16 s20, s1
+; CHECK-NEXT:    vmov.f32 s1, s10
+; CHECK-NEXT:    vmovx.f16 s10, s10
 ; CHECK-NEXT:    vmovx.f16 s21, s12
-; CHECK-NEXT:    vmovx.f16 s1, s14
+; CHECK-NEXT:    vins.f16 s20, s10
+; CHECK-NEXT:    vmovx.f16 s10, s14
 ; CHECK-NEXT:    vins.f16 s0, s2
 ; CHECK-NEXT:    vins.f16 s12, s14
 ; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vins.f16 s8, s10
-; CHECK-NEXT:    vins.f16 s21, s1
-; CHECK-NEXT:    vmov.f32 s9, s12
+; CHECK-NEXT:    vins.f16 s21, s10
 ; CHECK-NEXT:    vmov.f32 s10, s4
+; CHECK-NEXT:    vins.f16 s8, s1
+; CHECK-NEXT:    vmov.f32 s9, s12
 ; CHECK-NEXT:    vmov.f32 s11, s0
 ; CHECK-NEXT:    vadd.i16 q0, q2, q5
 ; CHECK-NEXT:    vadd.i16 q0, q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll
index 04be18e3dd873..6656d44eec81e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll
@@ -344,14 +344,14 @@ define void @loop_absmax32_pred_c(ptr %0, i32 %1, ptr nocapture %2) {
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    dlstp.32 lr, r1
 ; CHECK-NEXT:  .LBB19_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
-; CHECK-NEXT:    vmaxnma.f32 q1, q0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
+; CHECK-NEXT:    vmaxnma.f32 q0, q1
 ; CHECK-NEXT:    letp lr, .LBB19_1
 ; CHECK-NEXT:  @ %bb.2:
-; CHECK-NEXT:    vldr s0, .LCPI19_0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmaxnmav.f32 r0, q1
+; CHECK-NEXT:    vldr s4, .LCPI19_0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmaxnmav.f32 r0, q0
 ; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    vstr s0, [r2]
 ; CHECK-NEXT:    pop {r7, pc}
@@ -538,14 +538,14 @@ define void @loop_absmax16_pred_c(ptr %0, i32 %1, ptr nocapture %2) {
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    dlstp.16 lr, r1
 ; CHECK-NEXT:  .LBB23_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q1, [r0], #8
-; CHECK-NEXT:    vmaxnma.f16 q1, q0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0], #8
+; CHECK-NEXT:    vmaxnma.f16 q0, q1
 ; CHECK-NEXT:    letp lr, .LBB23_1
 ; CHECK-NEXT:  @ %bb.2:
-; CHECK-NEXT:    vldr.16 s0, .LCPI23_0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmaxnmav.f16 r0, q1
+; CHECK-NEXT:    vldr.16 s4, .LCPI23_0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmaxnmav.f16 r0, q0
 ; CHECK-NEXT:    vmov s0, r0
 ; CHECK-NEXT:    vstr.16 s0, [r2]
 ; CHECK-NEXT:    pop {r7, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index 26ab555c2c593..fb5f543fd0d3a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -1055,18 +1055,18 @@ define void @vst4_v4f16(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vins.f16 s12, s2
 ; CHECK-NEXT:    vmovx.f16 s2, s3
 ; CHECK-NEXT:    vins.f16 s11, s2
-; CHECK-NEXT:    vmovx.f16 s2, s4
-; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s6
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmovx.f16 s6, s4
+; CHECK-NEXT:    vins.f16 s4, s2
+; CHECK-NEXT:    vmovx.f16 s2, s2
 ; CHECK-NEXT:    vins.f16 s1, s3
-; CHECK-NEXT:    vins.f16 s2, s6
-; CHECK-NEXT:    vmovx.f16 s6, s7
+; CHECK-NEXT:    vins.f16 s6, s2
+; CHECK-NEXT:    vmovx.f16 s2, s7
 ; CHECK-NEXT:    vmov.f32 s8, s5
-; CHECK-NEXT:    vins.f16 s10, s6
+; CHECK-NEXT:    vins.f16 s10, s2
 ; CHECK-NEXT:    vmov.f32 s9, s1
 ; CHECK-NEXT:    vmov.f32 s5, s0
 ; CHECK-NEXT:    vstrh.16 q2, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s6, s2
 ; CHECK-NEXT:    vmov.f32 s7, s12
 ; CHECK-NEXT:    vstrh.16 q1, [r1]
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll
index e6fcf56af6e8d..2929a04cc0637 100644
--- a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll
+++ b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll
@@ -63,8 +63,8 @@ define hidden i32 @f(i32 %n) local_unnamed_addr #0 {
 ; CHECK-NEXT:    subs r0, #4
 ; CHECK-NEXT:    sub.w r3, r4, #16
 ; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
-; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:  .LBB0_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r5, [r3, #16]!

From 2aa629da6a611be3f8aec9e1dd6d0a7f5f8f6a23 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 12:49:21 -0800
Subject: [PATCH 06/97] AArch64: Enable terminal rule (#165959)

---
 llvm/lib/Target/AArch64/AArch64Subtarget.h    |  2 +-
 .../CodeGen/AArch64/build-vector-two-dup.ll   | 10 +-
 .../AArch64/machine-licm-sink-instr.ll        | 39 +++-----
 .../AArch64/machine-sink-kill-flags.ll        |  5 +-
 ...ate-sm-changing-call-disable-coalescing.ll | 85 +++++++----------
 .../sme-streaming-compatible-interface.ll     |  5 +-
 .../sve-extract-fixed-from-scalable-vector.ll | 12 +--
 .../AArch64/sve-extract-fixed-vector.ll       | 39 ++++----
 .../AArch64/sve-fixed-length-reshuffle.ll     | 12 +--
 .../AArch64/sve-fixed-length-shuffles.ll      | 72 +++++++--------
 .../CodeGen/AArch64/sve-ptest-removal-sink.ll |  8 +-
 llvm/test/CodeGen/AArch64/zext-to-tbl.ll      | 92 +++++++++----------
 12 files changed, 177 insertions(+), 204 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 8974965c41fe3..ab4004e30f629 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -157,7 +157,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   bool enableMachineScheduler() const override { return true; }
   bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
   bool enableSubRegLiveness() const override { return EnableSubregLiveness; }
-
+  bool enableTerminalRule() const override { return true; }
   bool enableMachinePipeliner() const override;
   bool useDFAforSMS() const override { return false; }
 
diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
index dbbfbea9176f6..f725c19081deb 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
@@ -188,11 +188,11 @@ entry:
 define <8 x i8> @test11(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test11:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ld1r { v1.8b }, [x0]
-; CHECK-NEXT:    ld1r { v2.8b }, [x1]
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-NEXT:    mov v0.h[3], v1.h[0]
+; CHECK-NEXT:    ld1r { v0.8b }, [x0]
+; CHECK-NEXT:    ld1r { v1.8b }, [x1]
+; CHECK-NEXT:    fmov d2, d0
+; CHECK-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-NEXT:    mov v0.h[3], v2.h[0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll
index 3230c9e946da7..b3a7ec961b736 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll
@@ -20,20 +20,17 @@ define i32 @sink_load_and_copy(i32 %n) {
 ; CHECK-NEXT:    b.lt .LBB0_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NEXT:    adrp x8, A
-; CHECK-NEXT:    mov w20, w19
-; CHECK-NEXT:    ldr w21, [x8, :lo12:A]
+; CHECK-NEXT:    mov w21, w19
+; CHECK-NEXT:    ldr w20, [x8, :lo12:A]
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov w0, w21
+; CHECK-NEXT:    mov w0, w20
 ; CHECK-NEXT:    bl _Z3usei
-; CHECK-NEXT:    sdiv w20, w20, w0
-; CHECK-NEXT:    subs w19, w19, #1
+; CHECK-NEXT:    sdiv w19, w19, w0
+; CHECK-NEXT:    subs w21, w21, #1
 ; CHECK-NEXT:    b.ne .LBB0_2
-; CHECK-NEXT:    b .LBB0_4
-; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    mov w20, w19
-; CHECK-NEXT:  .LBB0_4: // %for.cond.cleanup
-; CHECK-NEXT:    mov w0, w20
+; CHECK-NEXT:  .LBB0_3: // %for.cond.cleanup
+; CHECK-NEXT:    mov w0, w19
 ; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -82,15 +79,12 @@ define i32 @cant_sink_successive_call(i32 %n) {
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    mov w0, w20
 ; CHECK-NEXT:    bl _Z3usei
-; CHECK-NEXT:    sdiv w21, w21, w0
-; CHECK-NEXT:    subs w19, w19, #1
+; CHECK-NEXT:    sdiv w19, w19, w0
+; CHECK-NEXT:    subs w21, w21, #1
 ; CHECK-NEXT:    b.ne .LBB1_2
-; CHECK-NEXT:    b .LBB1_4
-; CHECK-NEXT:  .LBB1_3:
-; CHECK-NEXT:    mov w21, w19
-; CHECK-NEXT:  .LBB1_4: // %for.cond.cleanup
+; CHECK-NEXT:  .LBB1_3: // %for.cond.cleanup
+; CHECK-NEXT:    mov w0, w19
 ; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov w0, w21
 ; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
@@ -139,15 +133,12 @@ define i32 @cant_sink_successive_store(ptr nocapture readnone %store, i32 %n) {
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    mov w0, w20
 ; CHECK-NEXT:    bl _Z3usei
-; CHECK-NEXT:    sdiv w21, w21, w0
-; CHECK-NEXT:    subs w19, w19, #1
+; CHECK-NEXT:    sdiv w19, w19, w0
+; CHECK-NEXT:    subs w21, w21, #1
 ; CHECK-NEXT:    b.ne .LBB2_2
-; CHECK-NEXT:    b .LBB2_4
-; CHECK-NEXT:  .LBB2_3:
-; CHECK-NEXT:    mov w21, w19
-; CHECK-NEXT:  .LBB2_4: // %for.cond.cleanup
+; CHECK-NEXT:  .LBB2_3: // %for.cond.cleanup
+; CHECK-NEXT:    mov w0, w19
 ; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov w0, w21
 ; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
index e7e109170d6a1..338084295fc7f 100644
--- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
+++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
@@ -16,13 +16,12 @@ define i32 @test(ptr %ptr) {
 ; CHECK-NEXT:    mov w9, wzr
 ; CHECK-NEXT:  LBB0_1: ; %.thread
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    lsr w11, w9, #1
 ; CHECK-NEXT:    sub w10, w9, #1
-; CHECK-NEXT:    mov w9, w11
+; CHECK-NEXT:    lsr w9, w9, #1
 ; CHECK-NEXT:    tbnz w10, #0, LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %bb343
 ; CHECK-NEXT:    and w9, w10, #0x1
-; CHECK-NEXT:    mov w0, #-1
+; CHECK-NEXT:    mov w0, #-1 ; =0xffffffff
 ; CHECK-NEXT:    str w9, [x8]
 ; CHECK-NEXT:    ret
 bb:
diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
index b947c943ba448..72f6646930624 100644
--- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -151,12 +151,11 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    str h0, [sp, #14] // 2-byte Folded Spill
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr h0, [sp, #14] // 2-byte Folded Reload
 ; CHECK-NEXT:    bl use_f16
@@ -190,12 +189,11 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-NEXT:    bl use_f32
@@ -229,12 +227,11 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    bl use_f64
@@ -273,12 +270,11 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    bl use_v16i8
@@ -313,12 +309,11 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    bl use_v8i16
@@ -353,12 +348,11 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    bl use_v4i32
@@ -393,12 +387,11 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    bl use_v2i64
@@ -433,12 +426,11 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    str h0, [sp, #14] // 2-byte Folded Spill
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr h0, [sp, #14] // 2-byte Folded Reload
 ; CHECK-NEXT:    bl use_v8f16
@@ -513,12 +505,11 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    bl use_v2f64
@@ -557,12 +548,11 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl use_v16i8
@@ -596,12 +586,11 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl use_v8i16
@@ -635,12 +624,11 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl use_v4i32
@@ -674,12 +662,11 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl use_v2i64
@@ -713,12 +700,11 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl use_v8f16
@@ -752,12 +738,11 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl use_v8bf16
@@ -791,12 +776,11 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl use_v4f32
@@ -830,12 +814,11 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl use_v2f64
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index f2163ad15bafc..df88f37195ed6 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -129,12 +129,11 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
 ; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mrs x19, SVCR
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT:    mrs x19, SVCR
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    tbz w19, #0, .LBB4_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
index 6c6a691760af3..52a77cb396909 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
@@ -147,15 +147,15 @@ define <2 x float> @extract_v2f32_nxv16f32_2(<vscale x 16 x float> %arg) {
 define <4 x i1> @extract_v4i1_nxv32i1_0(<vscale x 32 x i1> %arg) {
 ; CHECK-LABEL: extract_v4i1_nxv32i1_0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
-; CHECK-NEXT:    umov w8, v1.b[1]
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    umov w9, v1.b[2]
+; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    umov w8, v0.b[1]
+; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    umov w8, v1.b[2]
+; CHECK-NEXT:    mov v0.h[2], w8
 ; CHECK-NEXT:    umov w8, v1.b[3]
-; CHECK-NEXT:    mov v0.h[2], w9
 ; CHECK-NEXT:    mov v0.h[3], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1> %arg, i64 0)
   ret <4 x i1> %ext
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
index e10313773c73e..72994100b2970 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -248,15 +248,15 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) {
 define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) {
 ; CHECK-LABEL: extract_v4i1_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    mov w8, v1.s[1]
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov w9, v1.s[2]
+; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    mov w8, v1.s[2]
+; CHECK-NEXT:    mov v0.h[2], w8
 ; CHECK-NEXT:    mov w8, v1.s[3]
-; CHECK-NEXT:    mov v0.h[2], w9
 ; CHECK-NEXT:    mov v0.h[3], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %mask = call <4 x i1> @llvm.vector.extract.v4i1.nxv4i1(<vscale x 4 x i1> %inmask, i64 0)
   ret <4 x i1> %mask
@@ -265,23 +265,23 @@ define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) {
 define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) {
 ; CHECK-LABEL: extract_v8i1_nxv8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
-; CHECK-NEXT:    umov w8, v1.h[1]
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    umov w9, v1.h[2]
+; CHECK-NEXT:    mov z0.h, p0/z, #1 // =0x1
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov v0.b[1], w8
+; CHECK-NEXT:    umov w8, v1.h[2]
+; CHECK-NEXT:    mov v0.b[2], w8
 ; CHECK-NEXT:    umov w8, v1.h[3]
-; CHECK-NEXT:    mov v0.b[2], w9
-; CHECK-NEXT:    umov w9, v1.h[4]
 ; CHECK-NEXT:    mov v0.b[3], w8
+; CHECK-NEXT:    umov w8, v1.h[4]
+; CHECK-NEXT:    mov v0.b[4], w8
 ; CHECK-NEXT:    umov w8, v1.h[5]
-; CHECK-NEXT:    mov v0.b[4], w9
-; CHECK-NEXT:    umov w9, v1.h[6]
 ; CHECK-NEXT:    mov v0.b[5], w8
+; CHECK-NEXT:    umov w8, v1.h[6]
+; CHECK-NEXT:    mov v0.b[6], w8
 ; CHECK-NEXT:    umov w8, v1.h[7]
-; CHECK-NEXT:    mov v0.b[6], w9
 ; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %mask = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> %inmask, i64 0)
   ret <8 x i1> %mask
@@ -292,9 +292,9 @@ define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) {
 define <16 x i1> @extract_v16i1_nxv16i1(<vscale x 16 x i1> %inmask) {
 ; CHECK-LABEL: extract_v16i1_nxv16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.b[1], v1.b[1]
+; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    mov v0.b[1], v0.b[1]
 ; CHECK-NEXT:    mov v0.b[2], v1.b[2]
 ; CHECK-NEXT:    mov v0.b[3], v1.b[3]
 ; CHECK-NEXT:    mov v0.b[4], v1.b[4]
@@ -309,6 +309,7 @@ define <16 x i1> @extract_v16i1_nxv16i1(<vscale x 16 x i1> %inmask) {
 ; CHECK-NEXT:    mov v0.b[13], v1.b[13]
 ; CHECK-NEXT:    mov v0.b[14], v1.b[14]
 ; CHECK-NEXT:    mov v0.b[15], v1.b[15]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %mask = call <16 x i1> @llvm.vector.extract.v16i1.nxv16i1(<vscale x 16 x i1> %inmask, i64 0)
   ret <16 x i1> %mask
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
index 41e4a38fad90b..8e807cda7166d 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
@@ -8,15 +8,15 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) #0 {
 ; CHECK-LABEL: reshuffle_v4i1_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    mov w8, v1.s[1]
-; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov w9, v1.s[2]
+; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    mov w8, v1.s[2]
+; CHECK-NEXT:    mov v0.h[2], w8
 ; CHECK-NEXT:    mov w8, v1.s[3]
-; CHECK-NEXT:    mov v0.h[2], w9
 ; CHECK-NEXT:    mov v0.h[3], w8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %el0 = extractelement <vscale x 4 x i1> %a, i32 0
   %el1 = extractelement <vscale x 4 x i1> %a, i32 1
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
index ba4a3a2042305..bd8f432579a08 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -28,53 +28,53 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    tbnz w1, #0, .LBB1_2
 ; CHECK-NEXT:  // %bb.1: // %vector.body
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    umov w8, v0.b[8]
-; CHECK-NEXT:    mov v1.b[1], v0.b[1]
-; CHECK-NEXT:    movprfx z3, z0
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #16
+; CHECK-NEXT:    umov w8, v2.b[8]
+; CHECK-NEXT:    mov v0.b[1], v2.b[1]
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #16
 ; CHECK-NEXT:    ext v4.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    mov v1.b[2], v0.b[2]
-; CHECK-NEXT:    mov v2.b[1], v0.b[9]
-; CHECK-NEXT:    mov v1.b[3], v0.b[3]
-; CHECK-NEXT:    mov v2.b[2], v0.b[10]
-; CHECK-NEXT:    mov v1.b[4], v0.b[4]
-; CHECK-NEXT:    mov v2.b[3], v0.b[11]
-; CHECK-NEXT:    mov v1.b[5], v0.b[5]
-; CHECK-NEXT:    mov v2.b[4], v0.b[12]
-; CHECK-NEXT:    mov v1.b[6], v0.b[6]
-; CHECK-NEXT:    mov v2.b[5], v0.b[13]
-; CHECK-NEXT:    mov v1.b[7], v0.b[7]
-; CHECK-NEXT:    mov v2.b[6], v0.b[14]
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    mov v2.b[7], v0.b[15]
-; CHECK-NEXT:    uunpklo z0.h, z3.b
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    mov v0.b[2], v2.b[2]
+; CHECK-NEXT:    mov v1.b[1], v2.b[9]
+; CHECK-NEXT:    mov v0.b[3], v2.b[3]
+; CHECK-NEXT:    mov v1.b[2], v2.b[10]
+; CHECK-NEXT:    mov v0.b[4], v2.b[4]
+; CHECK-NEXT:    mov v1.b[3], v2.b[11]
+; CHECK-NEXT:    mov v0.b[5], v2.b[5]
+; CHECK-NEXT:    mov v1.b[4], v2.b[12]
+; CHECK-NEXT:    mov v0.b[6], v2.b[6]
+; CHECK-NEXT:    mov v1.b[5], v2.b[13]
+; CHECK-NEXT:    mov v0.b[7], v2.b[7]
+; CHECK-NEXT:    mov v1.b[6], v2.b[14]
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    mov v1.b[7], v2.b[15]
+; CHECK-NEXT:    uunpklo z2.h, z3.b
 ; CHECK-NEXT:    uunpklo z3.h, z4.b
-; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z2.h, z2.b
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    lsl z1.s, z1.s, #31
+; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    lsl z0.s, z0.s, #31
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    lsl z2.s, z2.s, #31
 ; CHECK-NEXT:    lsl z3.s, z3.s, #31
-; CHECK-NEXT:    asr z1.s, z1.s, #31
 ; CHECK-NEXT:    asr z0.s, z0.s, #31
+; CHECK-NEXT:    asr z2.s, z2.s, #31
 ; CHECK-NEXT:    asr z3.s, z3.s, #31
-; CHECK-NEXT:    lsl z2.s, z2.s, #31
-; CHECK-NEXT:    cmpne p3.s, p0/z, z1.s, #0
-; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    lsl z1.s, z1.s, #31
+; CHECK-NEXT:    cmpne p3.s, p0/z, z0.s, #0
+; CHECK-NEXT:    cmpne p1.s, p0/z, z2.s, #0
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    cmpne p2.s, p0/z, z3.s, #0
-; CHECK-NEXT:    asr z2.s, z2.s, #31
-; CHECK-NEXT:    cmpne p0.s, p0/z, z2.s, #0
-; CHECK-NEXT:    st1w { z0.s }, p1, [x0, #2, mul vl]
-; CHECK-NEXT:    st1w { z0.s }, p2, [x0, #3, mul vl]
-; CHECK-NEXT:    st1w { z0.s }, p3, [x0]
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    asr z1.s, z1.s, #31
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT:    st1w { z2.s }, p1, [x0, #2, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p2, [x0, #3, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p3, [x0]
+; CHECK-NEXT:    st1w { z2.s }, p0, [x0, #1, mul vl]
 ; CHECK-NEXT:  .LBB1_2: // %exit
 ; CHECK-NEXT:    ret
   %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
index 124f81e7864d1..39fe92aae0619 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
 ; CHECK-NEXT:    whilelt p0.s, wzr, w0
 ; CHECK-NEXT:    b.pl .LBB0_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    cntw x9
+; CHECK-NEXT:    mov w9, wzr
+; CHECK-NEXT:    cntw x8
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    whilelt p0.s, w8, w0
-; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    whilelt p0.s, w9, w0
+; CHECK-NEXT:    add w9, w9, w8
 ; CHECK-NEXT:    b.mi .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: // %exit
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 74a717f1635a3..935189dec48ac 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -2835,11 +2835,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:  .LBB24_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x1], #16
-; CHECK-BE-NEXT:    add x8, x0, #16
+; CHECK-BE-NEXT:    mov x8, x0
 ; CHECK-BE-NEXT:    ld1 { v1.8h }, [x0]
-; CHECK-BE-NEXT:    ld1 { v3.8h }, [x8]
-; CHECK-BE-NEXT:    add x9, x0, #48
-; CHECK-BE-NEXT:    add x10, x0, #32
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x9, x8, #48
+; CHECK-BE-NEXT:    ld1 { v3.8h }, [x0]
 ; CHECK-BE-NEXT:    subs w2, w2, #1
 ; CHECK-BE-NEXT:    ushll v2.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v0.8h, v0.16b, #0
@@ -2847,11 +2847,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    umull2 v5.4s, v3.8h, v0.8h
 ; CHECK-BE-NEXT:    umull v0.4s, v3.4h, v0.4h
 ; CHECK-BE-NEXT:    umull2 v1.4s, v1.8h, v2.8h
-; CHECK-BE-NEXT:    st1 { v4.4s }, [x0]
-; CHECK-BE-NEXT:    mov x0, x8
+; CHECK-BE-NEXT:    st1 { v4.4s }, [x8]
+; CHECK-BE-NEXT:    add x8, x8, #32
 ; CHECK-BE-NEXT:    st1 { v5.4s }, [x9]
-; CHECK-BE-NEXT:    st1 { v0.4s }, [x10]
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x8]
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x0]
 ; CHECK-BE-NEXT:    b.ne .LBB24_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    mov w0, wzr
@@ -2950,26 +2950,26 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:  .LBB25_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ld1 { v4.16b }, [x0]
-; CHECK-BE-NEXT:    add x9, x1, #48
-; CHECK-BE-NEXT:    add x8, x1, #32
-; CHECK-BE-NEXT:    ld1 { v18.4s }, [x9]
+; CHECK-BE-NEXT:    add x10, x1, #48
 ; CHECK-BE-NEXT:    ld1 { v16.4s }, [x1]
+; CHECK-BE-NEXT:    add x9, x1, #32
+; CHECK-BE-NEXT:    ld1 { v18.4s }, [x10]
 ; CHECK-BE-NEXT:    add x1, x1, #16
-; CHECK-BE-NEXT:    ld1 { v20.4s }, [x8]
+; CHECK-BE-NEXT:    ld1 { v20.4s }, [x9]
 ; CHECK-BE-NEXT:    ld1 { v22.4s }, [x1]
-; CHECK-BE-NEXT:    add x8, x0, #96
+; CHECK-BE-NEXT:    add x9, x0, #96
 ; CHECK-BE-NEXT:    tbl v5.16b, { v4.16b }, v3.16b
 ; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v2.16b
 ; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v1.16b
 ; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v0.16b
 ; CHECK-BE-NEXT:    ext v24.16b, v18.16b, v18.16b, #8
-; CHECK-BE-NEXT:    add x9, x0, #32
+; CHECK-BE-NEXT:    mov x8, x0
 ; CHECK-BE-NEXT:    ext v25.16b, v20.16b, v20.16b, #8
-; CHECK-BE-NEXT:    add x10, x0, #16
+; CHECK-BE-NEXT:    add x10, x0, #32
 ; CHECK-BE-NEXT:    subs w2, w2, #1
 ; CHECK-BE-NEXT:    ext v17.16b, v5.16b, v5.16b, #8
-; CHECK-BE-NEXT:    ext v19.16b, v6.16b, v6.16b, #8
 ; CHECK-BE-NEXT:    rev32 v5.8b, v5.8b
+; CHECK-BE-NEXT:    ext v19.16b, v6.16b, v6.16b, #8
 ; CHECK-BE-NEXT:    rev32 v21.8b, v7.8b
 ; CHECK-BE-NEXT:    rev32 v23.8b, v4.8b
 ; CHECK-BE-NEXT:    ext v7.16b, v7.16b, v7.16b, #8
@@ -2986,22 +2986,22 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    rev32 v4.8b, v4.8b
 ; CHECK-BE-NEXT:    umull v17.2d, v17.2s, v24.2s
 ; CHECK-BE-NEXT:    umull v19.2d, v19.2s, v25.2s
-; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
+; CHECK-BE-NEXT:    st1 { v5.2d }, [x9]
 ; CHECK-BE-NEXT:    umull v5.2d, v6.2s, v20.2s
 ; CHECK-BE-NEXT:    umull v6.2d, v7.2s, v21.2s
-; CHECK-BE-NEXT:    add x8, x0, #112
+; CHECK-BE-NEXT:    add x9, x0, #112
 ; CHECK-BE-NEXT:    umull v4.2d, v4.2s, v16.2s
-; CHECK-BE-NEXT:    st1 { v18.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x0, #80
+; CHECK-BE-NEXT:    st1 { v18.2d }, [x10]
+; CHECK-BE-NEXT:    add x10, x0, #80
 ; CHECK-BE-NEXT:    st1 { v22.2d }, [x0]
-; CHECK-BE-NEXT:    st1 { v17.2d }, [x8]
-; CHECK-BE-NEXT:    add x8, x0, #64
-; CHECK-BE-NEXT:    st1 { v19.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x0, #48
-; CHECK-BE-NEXT:    mov x0, x8
-; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
+; CHECK-BE-NEXT:    add x0, x0, #64
+; CHECK-BE-NEXT:    st1 { v17.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x8, #48
+; CHECK-BE-NEXT:    add x8, x8, #16
+; CHECK-BE-NEXT:    st1 { v19.2d }, [x10]
+; CHECK-BE-NEXT:    st1 { v5.2d }, [x0]
 ; CHECK-BE-NEXT:    st1 { v6.2d }, [x9]
-; CHECK-BE-NEXT:    st1 { v4.2d }, [x10]
+; CHECK-BE-NEXT:    st1 { v4.2d }, [x8]
 ; CHECK-BE-NEXT:    b.ne .LBB25_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    mov w0, wzr
@@ -3093,13 +3093,14 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:  .LBB26_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ld1 { v4.16b }, [x1], #16
-; CHECK-BE-NEXT:    add x8, x0, #32
+; CHECK-BE-NEXT:    mov x8, x0
+; CHECK-BE-NEXT:    add x9, x0, #32
 ; CHECK-BE-NEXT:    ld1 { v16.4s }, [x0]
-; CHECK-BE-NEXT:    add x9, x0, #48
-; CHECK-BE-NEXT:    add x10, x0, #16
-; CHECK-BE-NEXT:    ld1 { v17.4s }, [x8]
-; CHECK-BE-NEXT:    ld1 { v18.4s }, [x9]
-; CHECK-BE-NEXT:    ld1 { v19.4s }, [x10]
+; CHECK-BE-NEXT:    add x10, x0, #48
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    ld1 { v17.4s }, [x9]
+; CHECK-BE-NEXT:    ld1 { v18.4s }, [x10]
+; CHECK-BE-NEXT:    ld1 { v19.4s }, [x0]
 ; CHECK-BE-NEXT:    subs w2, w2, #1
 ; CHECK-BE-NEXT:    tbl v5.16b, { v4.16b }, v1.16b
 ; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v3.16b
@@ -3113,11 +3114,10 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    mul v6.4s, v17.4s, v6.4s
 ; CHECK-BE-NEXT:    mul v7.4s, v18.4s, v7.4s
 ; CHECK-BE-NEXT:    mul v4.4s, v19.4s, v4.4s
-; CHECK-BE-NEXT:    st1 { v5.4s }, [x0]
-; CHECK-BE-NEXT:    mov x0, x10
-; CHECK-BE-NEXT:    st1 { v6.4s }, [x8]
-; CHECK-BE-NEXT:    st1 { v7.4s }, [x9]
-; CHECK-BE-NEXT:    st1 { v4.4s }, [x10]
+; CHECK-BE-NEXT:    st1 { v5.4s }, [x8]
+; CHECK-BE-NEXT:    st1 { v6.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v7.4s }, [x10]
+; CHECK-BE-NEXT:    st1 { v4.4s }, [x0]
 ; CHECK-BE-NEXT:    b.ne .LBB26_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    mov w0, wzr
@@ -3246,11 +3246,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:  .LBB28_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x1], #16
-; CHECK-BE-NEXT:    add x8, x0, #16
+; CHECK-BE-NEXT:    mov x8, x0
 ; CHECK-BE-NEXT:    ld1 { v1.8h }, [x0]
-; CHECK-BE-NEXT:    ld1 { v3.8h }, [x8]
-; CHECK-BE-NEXT:    add x9, x0, #48
-; CHECK-BE-NEXT:    add x10, x0, #32
+; CHECK-BE-NEXT:    add x0, x0, #16
+; CHECK-BE-NEXT:    add x9, x8, #48
+; CHECK-BE-NEXT:    ld1 { v3.8h }, [x0]
 ; CHECK-BE-NEXT:    subs w2, w2, #1
 ; CHECK-BE-NEXT:    ushll v2.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v0.8h, v0.16b, #0
@@ -3258,11 +3258,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    smull2 v5.4s, v3.8h, v0.8h
 ; CHECK-BE-NEXT:    smull v0.4s, v3.4h, v0.4h
 ; CHECK-BE-NEXT:    smull2 v1.4s, v1.8h, v2.8h
-; CHECK-BE-NEXT:    st1 { v4.4s }, [x0]
-; CHECK-BE-NEXT:    mov x0, x8
+; CHECK-BE-NEXT:    st1 { v4.4s }, [x8]
+; CHECK-BE-NEXT:    add x8, x8, #32
 ; CHECK-BE-NEXT:    st1 { v5.4s }, [x9]
-; CHECK-BE-NEXT:    st1 { v0.4s }, [x10]
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x8]
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x0]
 ; CHECK-BE-NEXT:    b.ne .LBB28_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    mov w0, wzr

From 067f1556d0ec5a38fdc8b86596d0e3544f4e6239 Mon Sep 17 00:00:00 2001
From: Brox Chen <guochen2@amd.com>
Date: Mon, 10 Nov 2025 15:50:51 -0500
Subject: [PATCH 07/97] [AMDGPU] remove clamp and omod for trans bf16 insts
 (#165819)

trans bf16 insts do not support clamp and omod
---
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    | 39 ++++++--
 llvm/lib/Target/AMDGPU/VOPInstructions.td     |  8 +-
 llvm/test/CodeGen/AMDGPU/bf16-math.ll         |  5 +-
 llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s    | 80 ++++++++++++++++
 .../gfx1250_asm_vop3_from_vop1-fake16.s       | 72 --------------
 .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s    | 72 --------------
 .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 96 -------------------
 .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 96 -------------------
 .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s  | 96 -------------------
 .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s  | 96 -------------------
 .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt    | 96 -------------------
 .../gfx1250_dasm_vop3_from_vop1_dpp16.txt     | 96 -------------------
 .../gfx1250_dasm_vop3_from_vop1_dpp8.txt      | 96 -------------------
 13 files changed, 121 insertions(+), 827 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 54f57e02ed47e..85adcab55b742 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -513,6 +513,13 @@ defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16",
 defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16",
   VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16, fp_to_sint>;
 
+
+let HasClamp = 0, HasOMod = 0 in {
+def V_TRANS_BF16_Profile :  VOPProfile <[bf16, bf16, untyped, untyped]>;
+def V_TRANS_BF16_t16_Profile :  VOPProfile_True16 <VOP_BF16_BF16>;
+def V_TRANS_BF16_fake16_Profile :  VOPProfile_Fake16 <VOP_BF16_BF16>;
+}
+
 let TRANS = 1, SchedRW = [WriteTrans32] in {
 defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
 defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
@@ -527,14 +534,30 @@ defm V_TANH_F16  : VOP1Inst_t16 <"v_tanh_f16",  VOP_F16_F16, int_amdgcn_tanh>;
 }
 
 let SubtargetPredicate = HasBF16TransInsts in {
-defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
-defm V_RCP_BF16  : VOP1Inst_t16 <"v_rcp_bf16",  VOP_BF16_BF16, AMDGPUrcp>;
-defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
-defm V_RSQ_BF16  : VOP1Inst_t16 <"v_rsq_bf16",  VOP_BF16_BF16, AMDGPUrsq>;
-defm V_LOG_BF16  : VOP1Inst_t16 <"v_log_bf16",  VOP_BF16_BF16, AMDGPUlogf16>;
-defm V_EXP_BF16  : VOP1Inst_t16 <"v_exp_bf16",  VOP_BF16_BF16, AMDGPUexpf16>;
-defm V_SIN_BF16  : VOP1Inst_t16 <"v_sin_bf16",  VOP_BF16_BF16, AMDGPUsin>;
-defm V_COS_BF16  : VOP1Inst_t16 <"v_cos_bf16",  VOP_BF16_BF16, AMDGPUcos>;
+defm V_TANH_BF16 : VOP1Inst_t16_with_profiles<"v_tanh_bf16", V_TRANS_BF16_Profile,
+                                               V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                               int_amdgcn_tanh>;
+defm V_RCP_BF16  : VOP1Inst_t16_with_profiles<"v_rcp_bf16", V_TRANS_BF16_Profile,
+                                               V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                               AMDGPUrcp>;
+defm V_SQRT_BF16 : VOP1Inst_t16_with_profiles<"v_sqrt_bf16", V_TRANS_BF16_Profile,
+                                               V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                               any_amdgcn_sqrt>;
+defm V_RSQ_BF16  : VOP1Inst_t16_with_profiles<"v_rsq_bf16", V_TRANS_BF16_Profile,
+                                               V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                               AMDGPUrsq>;
+defm V_LOG_BF16  : VOP1Inst_t16_with_profiles<"v_log_bf16", V_TRANS_BF16_Profile,
+                                               V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                               AMDGPUlogf16>;
+defm V_EXP_BF16  : VOP1Inst_t16_with_profiles<"v_exp_bf16", V_TRANS_BF16_Profile,
+                                               V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                               AMDGPUexpf16>;
+defm V_SIN_BF16  : VOP1Inst_t16_with_profiles<"v_sin_bf16", V_TRANS_BF16_Profile,
+                                               V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                               AMDGPUsin>;
+defm V_COS_BF16  : VOP1Inst_t16_with_profiles<"v_cos_bf16", V_TRANS_BF16_Profile,
+                                               V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile,
+                                               AMDGPUcos>;
 }
 } // End TRANS = 1, SchedRW = [WriteTrans32]
 defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 8325c628d68d6..6df91a1d438b0 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1357,8 +1357,12 @@ class VOPBinOpClampPat<SDPatternOperator node, Instruction inst, ValueType vt> :
 
 class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
   dag src0 = !if(P.HasOMod,
-    (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
-    (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp));
+    !if(P.HasClamp,
+        (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+        (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i32:$omod)),
+    !if(P.HasClamp,
+        (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+        (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers)));
 
   list<dag> ret3 = [(set P.DstVT:$vdst,
     (DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
index 30a78648c186a..39618b05e6c71 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
@@ -368,7 +368,10 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
 define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
 ; GCN-LABEL: test_clamp_bf16_folding:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    v_exp_bf16_e64 v0, v0 clamp
+; GCN-NEXT:    v_exp_bf16_e32 v0, v0
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
+; GCN-NEXT:    v_pk_max_num_bf16 v0, v0, v0 clamp
 ; GCN-NEXT:    ; return to shader part epilog
   %exp = call bfloat @llvm.exp2.bf16(bfloat %src)
   %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s
index c393d3e819880..3f6d8feb45df0 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s
@@ -34,3 +34,83 @@ v_cvt_f32_bf16 v5, v1 div:2
 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
 // GFX1250-ERR-NEXT:{{^}}v_cvt_f32_bf16 v5, v1 div:2
 // GFX1250-ERR-NEXT:{{^}}                      ^
+
+v_cos_bf16 v1, v2 clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 clamp
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_cos_bf16 v1, v2 mul:2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 mul:2
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_exp_bf16 v1, v2 clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 clamp
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_exp_bf16 v1, v2 mul:2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 mul:2
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_log_bf16 v1, v2 clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 clamp
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_log_bf16 v1, v2 mul:2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 mul:2
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_rcp_bf16 v1, v2 clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 clamp
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_rcp_bf16 v1, v2 mul:2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 mul:2
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_rsq_bf16 v1, v2 clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 clamp
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_rsq_bf16 v1, v2 mul:2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 mul:2
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_sin_bf16 v1, v2 clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 clamp
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_sin_bf16 v1, v2 mul:2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 mul:2
+// GFX1250-ERR-NEXT:{{^}}                  ^
+
+v_sqrt_bf16 v1, v2 clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 clamp
+// GFX1250-ERR-NEXT:{{^}}                   ^
+
+v_sqrt_bf16 v1, v2 mul:2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 mul:2
+// GFX1250-ERR-NEXT:{{^}}                   ^
+
+v_tanh_bf16 v1, v2 clamp
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 clamp
+// GFX1250-ERR-NEXT:{{^}}                   ^
+
+v_tanh_bf16 v1, v2 mul:2
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 mul:2
+// GFX1250-ERR-NEXT:{{^}}                   ^
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
index 0931523bbf40c..37ad6eb249da4 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
@@ -3781,15 +3781,6 @@ v_tanh_bf16_e64 v5, null
 v_tanh_bf16_e64 v5, -1
 // GFX1250: v_tanh_bf16_e64 v5, -1                  ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00]
 
-v_tanh_bf16_e64 v5, 0.5 mul:2
-// GFX1250: v_tanh_bf16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08]
-
-v_tanh_bf16_e64 v5, src_scc mul:4
-// GFX1250: v_tanh_bf16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10]
-
-v_tanh_bf16_e64 v255, -|0x8000| clamp div:2
-// GFX1250: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_prng_b32_e64 v5, v1
 // GFX1250: v_prng_b32_e64 v5, v1                   ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00]
 
@@ -3862,15 +3853,6 @@ v_rcp_bf16_e64 v5, null
 v_rcp_bf16_e64 v5, -1
 // GFX1250: v_rcp_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rcp_bf16_e64 v5, 0.5 mul:2
-// GFX1250: v_rcp_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
-
-v_rcp_bf16_e64 v5, src_scc mul:4
-// GFX1250: v_rcp_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
-
-v_rcp_bf16_e64 v255, -|0x8000| clamp div:2
-// GFX1250: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_sqrt_bf16_e64 v5, v1
 // GFX1250: v_sqrt_bf16_e64 v5, v1                  ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00]
 
@@ -3907,15 +3889,6 @@ v_sqrt_bf16_e64 v5, null
 v_sqrt_bf16_e64 v5, -1
 // GFX1250: v_sqrt_bf16_e64 v5, -1                  ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sqrt_bf16_e64 v5, 0.5 mul:2
-// GFX1250: v_sqrt_bf16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
-
-v_sqrt_bf16_e64 v5, src_scc mul:4
-// GFX1250: v_sqrt_bf16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
-
-v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2
-// GFX1250: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_rsq_bf16_e64 v5, v1
 // GFX1250: v_rsq_bf16_e64 v5, v1                   ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00]
 
@@ -3952,15 +3925,6 @@ v_rsq_bf16_e64 v5, null
 v_rsq_bf16_e64 v5, -1
 // GFX1250: v_rsq_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rsq_bf16_e64 v5, 0.5 mul:2
-// GFX1250: v_rsq_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
-
-v_rsq_bf16_e64 v5, src_scc mul:4
-// GFX1250: v_rsq_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
-
-v_rsq_bf16_e64 v255, -|0x8000| clamp div:2
-// GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_log_bf16_e64 v5, v1
 // GFX1250: v_log_bf16_e64 v5, v1                   ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00]
 
@@ -3997,15 +3961,6 @@ v_log_bf16_e64 v5, null
 v_log_bf16_e64 v5, -1
 // GFX1250: v_log_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]
 
-v_log_bf16_e64 v5, 0.5 mul:2
-// GFX1250: v_log_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
-
-v_log_bf16_e64 v5, src_scc mul:4
-// GFX1250: v_log_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
-
-v_log_bf16_e64 v255, -|0x8000| clamp div:2
-// GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_exp_bf16_e64 v5, v1
 // GFX1250: v_exp_bf16_e64 v5, v1                   ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00]
 
@@ -4042,15 +3997,6 @@ v_exp_bf16_e64 v5, null
 v_exp_bf16_e64 v5, -1
 // GFX1250: v_exp_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]
 
-v_exp_bf16_e64 v5, 0.5 mul:2
-// GFX1250: v_exp_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
-
-v_exp_bf16_e64 v5, src_scc mul:4
-// GFX1250: v_exp_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
-
-v_exp_bf16_e64 v255, -|0x8000| clamp div:2
-// GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_sin_bf16_e64 v5, v1
 // GFX1250: v_sin_bf16_e64 v5, v1                   ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00]
 
@@ -4087,15 +4033,6 @@ v_sin_bf16_e64 v5, null
 v_sin_bf16_e64 v5, -1
 // GFX1250: v_sin_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_bf16_e64 v5, 0.5 mul:2
-// GFX1250: v_sin_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
-
-v_sin_bf16_e64 v5, src_scc mul:4
-// GFX1250: v_sin_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
-
-v_sin_bf16_e64 v255, -|0x8000| clamp div:2
-// GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_cos_bf16_e64 v5, v1
 // GFX1250: v_cos_bf16_e64 v5, v1                   ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00]
 
@@ -4132,15 +4069,6 @@ v_cos_bf16_e64 v5, null
 v_cos_bf16_e64 v5, -1
 // GFX1250: v_cos_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_bf16_e64 v5, 0.5 mul:2
-// GFX1250: v_cos_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
-
-v_cos_bf16_e64 v5, src_scc mul:4
-// GFX1250: v_cos_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
-
-v_cos_bf16_e64 v255, -|0x8000| clamp div:2
-// GFX1250: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_cvt_f32_bf16_e64 v5, v1
 // GFX1250: v_cvt_f32_bf16_e64 v5, v1               ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
index 5ac9eb47381d6..52f9ba3a99483 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
@@ -3952,15 +3952,6 @@ v_tanh_bf16_e64 v5.l, null
 v_tanh_bf16_e64 v5.l, -1
 // GFX1250: v_tanh_bf16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00]
 
-v_tanh_bf16_e64 v5.l, 0.5 mul:2
-// GFX1250: v_tanh_bf16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08]
-
-v_tanh_bf16_e64 v5.l, src_scc mul:4
-// GFX1250: v_tanh_bf16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10]
-
-v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2
-// GFX1250: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_tanh_bf16 v5.l, v128.h
 // GFX1250: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00]
 
@@ -4036,15 +4027,6 @@ v_rcp_bf16_e64 v5.l, null
 v_rcp_bf16_e64 v5.l, -1
 // GFX1250: v_rcp_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rcp_bf16_e64 v5.l, 0.5 mul:2
-// GFX1250: v_rcp_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
-
-v_rcp_bf16_e64 v5.l, src_scc mul:4
-// GFX1250: v_rcp_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
-
-v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2
-// GFX1250: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_rcp_bf16 v5.h, v128.h
 // GFX1250: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00]
 
@@ -4084,15 +4066,6 @@ v_sqrt_bf16_e64 v5.l, null
 v_sqrt_bf16_e64 v5.l, -1
 // GFX1250: v_sqrt_bf16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sqrt_bf16_e64 v5.l, 0.5 mul:2
-// GFX1250: v_sqrt_bf16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
-
-v_sqrt_bf16_e64 v5.l, src_scc mul:4
-// GFX1250: v_sqrt_bf16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
-
-v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2
-// GFX1250: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_sqrt_bf16 v5.h, v128.h
 // GFX1250: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00]
 
@@ -4132,15 +4105,6 @@ v_rsq_bf16_e64 v5.l, null
 v_rsq_bf16_e64 v5.l, -1
 // GFX1250: v_rsq_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]
 
-v_rsq_bf16_e64 v5.l, 0.5 mul:2
-// GFX1250: v_rsq_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
-
-v_rsq_bf16_e64 v5.l, src_scc mul:4
-// GFX1250: v_rsq_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
-
-v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2
-// GFX1250: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_rsq_bf16 v5.h, v128.h
 // GFX1250: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00]
 
@@ -4180,15 +4144,6 @@ v_log_bf16_e64 v5.l, null
 v_log_bf16_e64 v5.l, -1
 // GFX1250: v_log_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]
 
-v_log_bf16_e64 v5.l, 0.5 mul:2
-// GFX1250: v_log_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
-
-v_log_bf16_e64 v5.l, src_scc mul:4
-// GFX1250: v_log_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
-
-v_log_bf16_e64 v255.l, -|0x8000| clamp div:2
-// GFX1250: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_log_bf16 v5.h, v128.h
 // GFX1250: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00]
 
@@ -4228,15 +4183,6 @@ v_exp_bf16_e64 v5.l, null
 v_exp_bf16_e64 v5.l, -1
 // GFX1250: v_exp_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]
 
-v_exp_bf16_e64 v5.l, 0.5 mul:2
-// GFX1250: v_exp_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
-
-v_exp_bf16_e64 v5.l, src_scc mul:4
-// GFX1250: v_exp_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
-
-v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2
-// GFX1250: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_exp_bf16 v5.h, v128.h
 // GFX1250: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00]
 
@@ -4276,15 +4222,6 @@ v_sin_bf16_e64 v5.l, null
 v_sin_bf16_e64 v5.l, -1
 // GFX1250: v_sin_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]
 
-v_sin_bf16_e64 v5.l, 0.5 mul:2
-// GFX1250: v_sin_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
-
-v_sin_bf16_e64 v5.l, src_scc mul:4
-// GFX1250: v_sin_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
-
-v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2
-// GFX1250: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_sin_bf16 v5.h, v128.h
 // GFX1250: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00]
 
@@ -4324,15 +4261,6 @@ v_cos_bf16_e64 v5.l, null
 v_cos_bf16_e64 v5.l, -1
 // GFX1250: v_cos_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]
 
-v_cos_bf16_e64 v5.l, 0.5 mul:2
-// GFX1250: v_cos_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
-
-v_cos_bf16_e64 v5.l, src_scc mul:4
-// GFX1250: v_cos_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
-
-v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2
-// GFX1250: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 v_cos_bf16_e64 v5.h, v128.h
 // GFX1250: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00]
 
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
index b21fca654590a..21077fe4f9f05 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
@@ -158,18 +158,6 @@ v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -258,18 +246,6 @@ v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -314,18 +290,6 @@ v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -370,18 +334,6 @@ v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -426,18 +378,6 @@ v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -482,18 +422,6 @@ v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -538,18 +466,6 @@ v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -594,18 +510,6 @@ v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
 // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s
index d1638565a386a..646acf5219d7e 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s
@@ -162,18 +162,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_tanh_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
 // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -266,18 +254,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_rcp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
 // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -326,18 +302,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sqrt_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
 // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -386,18 +350,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_rsq_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
 // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -446,18 +398,6 @@ v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_log_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
 // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -506,18 +446,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_exp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
 // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -566,18 +494,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sin_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
 // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -626,18 +542,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
 // GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cos_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
 // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s
index 78afa10b984cb..1907a939b488b 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s
@@ -38,18 +38,6 @@ v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -58,114 +46,30 @@ v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
index 6ec4d5f48f8b1..35a51dbe9f922 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
@@ -42,18 +42,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_tanh_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -66,18 +54,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_rcp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -86,18 +62,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sqrt_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -106,18 +70,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_rsq_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -126,18 +78,6 @@ v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_log_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -146,18 +86,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_exp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -166,18 +94,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_sin_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -186,18 +102,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
 
-v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
 v_cos_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
 // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
index 67747a65ee52a..0b393973b7875 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
@@ -4123,18 +4123,10 @@
 # GFX1250-REAL16: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00]
 # GFX1250-FAKE16: v_tanh_f16_e64 v5, v128                 ; encoding: [0x05,0x00,0x9f,0xd5,0x80,0x01,0x00,0x00]
 
-0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00
 # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_tanh_bf16_e64 v5, -1                  ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00]
 
-0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08]
-
 0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00
 # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_tanh_bf16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00]
@@ -4159,10 +4151,6 @@
 # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_tanh_bf16_e64 v5, s105                ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00]
 
-0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10]
-
 0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00
 # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_tanh_bf16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00]
@@ -4223,18 +4211,10 @@
 0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00
 # GFX1250: v_prng_b32_e64 v5, vcc_lo               ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00]
 
-0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00
 # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_rcp_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]
 
-0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
-
 0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00
 # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, exec_hi            ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_rcp_bf16_e64 v5, exec_hi              ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00]
@@ -4259,10 +4239,6 @@
 # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, s105               ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_rcp_bf16_e64 v5, s105                 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00]
 
-0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
-
 0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00
 # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, ttmp15             ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_rcp_bf16_e64 v5, ttmp15               ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00]
@@ -4287,18 +4263,10 @@
 # GFX1250-REAL16: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00]
 # GFX1250-FAKE16: v_rcp_bf16_e64 v5, v128                 ; encoding: [0x05,0x00,0xf9,0xd5,0x80,0x01,0x00,0x00]
 
-0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00
 # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, -1                ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, -1                  ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]
 
-0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, 0.5 mul:2         ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, 0.5 mul:2           ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
-
 0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00
 # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, exec_hi           ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, exec_hi             ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00]
@@ -4323,10 +4291,6 @@
 # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, s105              ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, s105                ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00]
 
-0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, src_scc mul:4     ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, src_scc mul:4       ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
-
 0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00
 # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, ttmp15            ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, ttmp15              ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00]
@@ -4351,18 +4315,10 @@
 # GFX1250-REAL16: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00]
 # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, v128                ; encoding: [0x05,0x00,0xfa,0xd5,0x80,0x01,0x00,0x00]
 
-0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00
 # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_rsq_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]
 
-0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
-
 0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00
 # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, exec_hi            ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_rsq_bf16_e64 v5, exec_hi              ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00]
@@ -4387,10 +4343,6 @@
 # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, s105               ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_rsq_bf16_e64 v5, s105                 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00]
 
-0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
-
 0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00
 # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, ttmp15             ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_rsq_bf16_e64 v5, ttmp15               ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00]
@@ -4415,18 +4367,10 @@
 # GFX1250-REAL16: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00]
 # GFX1250-FAKE16: v_rsq_bf16_e64 v5, v128                 ; encoding: [0x05,0x00,0xfb,0xd5,0x80,0x01,0x00,0x00]
 
-0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00
 # GFX1250-REAL16: v_log_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_log_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]
 
-0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_log_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_log_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
-
 0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00
 # GFX1250-REAL16: v_log_bf16_e64 v5.l, exec_hi            ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_log_bf16_e64 v5, exec_hi              ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00]
@@ -4451,10 +4395,6 @@
 # GFX1250-REAL16: v_log_bf16_e64 v5.l, s105               ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_log_bf16_e64 v5, s105                 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00]
 
-0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_log_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_log_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
-
 0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00
 # GFX1250-REAL16: v_log_bf16_e64 v5.l, ttmp15             ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_log_bf16_e64 v5, ttmp15               ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00]
@@ -4479,18 +4419,10 @@
 # GFX1250-REAL16: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00]
 # GFX1250-FAKE16: v_log_bf16_e64 v5, v128                 ; encoding: [0x05,0x00,0xfc,0xd5,0x80,0x01,0x00,0x00]
 
-0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00
 # GFX1250-REAL16: v_exp_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_exp_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]
 
-0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_exp_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_exp_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
-
 0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00
 # GFX1250-REAL16: v_exp_bf16_e64 v5.l, exec_hi            ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_exp_bf16_e64 v5, exec_hi              ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00]
@@ -4515,10 +4447,6 @@
 # GFX1250-REAL16: v_exp_bf16_e64 v5.l, s105               ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_exp_bf16_e64 v5, s105                 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00]
 
-0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_exp_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_exp_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
-
 0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00
 # GFX1250-REAL16: v_exp_bf16_e64 v5.l, ttmp15             ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_exp_bf16_e64 v5, ttmp15               ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00]
@@ -4543,18 +4471,10 @@
 # GFX1250-REAL16: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00]
 # GFX1250-FAKE16: v_exp_bf16_e64 v5, v128                 ; encoding: [0x05,0x00,0xfd,0xd5,0x80,0x01,0x00,0x00]
 
-0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00
 # GFX1250-REAL16: v_sin_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_sin_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]
 
-0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_sin_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_sin_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
-
 0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00
 # GFX1250-REAL16: v_sin_bf16_e64 v5.l, exec_hi            ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_sin_bf16_e64 v5, exec_hi              ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00]
@@ -4579,10 +4499,6 @@
 # GFX1250-REAL16: v_sin_bf16_e64 v5.l, s105               ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_sin_bf16_e64 v5, s105                 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00]
 
-0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_sin_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_sin_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
-
 0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00
 # GFX1250-REAL16: v_sin_bf16_e64 v5.l, ttmp15             ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_sin_bf16_e64 v5, ttmp15               ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00]
@@ -4607,18 +4523,10 @@
 # GFX1250-REAL16: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00]
 # GFX1250-FAKE16: v_sin_bf16_e64 v5, v128                 ; encoding: [0x05,0x00,0xfe,0xd5,0x80,0x01,0x00,0x00]
 
-0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
 0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00
 # GFX1250-REAL16: v_cos_bf16_e64 v5.l, -1                 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_cos_bf16_e64 v5, -1                   ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]
 
-0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_cos_bf16_e64 v5.l, 0.5 mul:2          ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_cos_bf16_e64 v5, 0.5 mul:2            ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
-
 0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00
 # GFX1250-REAL16: v_cos_bf16_e64 v5.l, exec_hi            ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_cos_bf16_e64 v5, exec_hi              ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00]
@@ -4643,10 +4551,6 @@
 # GFX1250-REAL16: v_cos_bf16_e64 v5.l, s105               ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_cos_bf16_e64 v5, s105                 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00]
 
-0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_cos_bf16_e64 v5.l, src_scc mul:4      ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_cos_bf16_e64 v5, src_scc mul:4        ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
-
 0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00
 # GFX1250-REAL16: v_cos_bf16_e64 v5.l, ttmp15             ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00]
 # GFX1250-FAKE16: v_cos_bf16_e64 v5, ttmp15               ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
index 7c29f8ab01a1b..8b26d2a8696e2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
@@ -104,18 +104,6 @@
 # GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 # GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 
-0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
 0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
 # GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -197,18 +185,6 @@
 0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
 # GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
 
-0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
 0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -257,18 +233,6 @@
 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 
-0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
 0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
 # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -317,18 +281,6 @@
 # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 
-0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
 0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -377,18 +329,6 @@
 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 
-0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
 0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -437,18 +377,6 @@
 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 
-0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
 0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -497,18 +425,6 @@
 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 
-0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
 0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
@@ -557,18 +473,6 @@
 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
 
-0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
 0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
 # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
index d26bc46a1f272..15f76c54a1c65 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
@@ -34,22 +34,10 @@
 # GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 
-0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
 0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
 0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
 # GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
@@ -57,142 +45,58 @@
 0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
 0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
 0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 
-0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
 0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
 0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
 # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 
-0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
 0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
 0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 
-0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
 0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
 0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 
-0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
 0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
 0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 
-0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
 0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
 0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 
-0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-
 0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 
-0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-
-0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05
-# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-
 0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
 # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]

From 826cadd5628413a0a92722e0382ba9d968911acc Mon Sep 17 00:00:00 2001
From: Fateme Hosseini <quic_fhossein@quicinc.com>
Date: Mon, 10 Nov 2025 14:55:07 -0600
Subject: [PATCH 08/97] [Hexagon] Clean-up Instrprof test (#166990)

---
 llvm/test/CodeGen/Hexagon/instrprof-custom.ll | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll
index 620b2acc49520..1c1965d44541f 100644
--- a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll
+++ b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=hexagon -relocation-model=pic < %s | FileCheck %s
-; RUN: llc -mtriple=hexagon < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon --mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp < %s | FileCheck %s
 
 ; CHECK-LABEL: test1:
 ; CHECK: {{call my_instrprof_handler|r0 = #999}}
@@ -14,7 +14,4 @@ entry:
 }
 
 ; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn
-declare void @llvm.hexagon.instrprof.custom(ptr, i32) #1
-
-attributes #0 = { "target-features"="+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp,+hmxv68" }
-attributes #1 = { inaccessiblememonly nofree nosync nounwind willreturn }
+declare void @llvm.hexagon.instrprof.custom(ptr, i32)

From 17e26411f8b8a404e07a628ba17986a2392c1f79 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht <rupprecht@google.com>
Date: Mon, 10 Nov 2025 14:56:59 -0600
Subject: [PATCH 09/97] [bazel][clang] Port #167374: split clang options/driver
 (#167387)

---
 .../llvm-project-overlay/clang/BUILD.bazel    | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 36b71dd49ef13..deb56dc0957e9 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1231,6 +1231,7 @@ cc_library(
         ":format",
         ":frontend",
         ":lex",
+        ":options",
         ":rewrite",
         ":support",
         ":tooling_core",
@@ -1507,6 +1508,19 @@ gentbl_cc_library(
     deps = ["//llvm:OptParserTdFiles"],
 )
 
+cc_library(
+    name = "options",
+    srcs = glob(["lib/Options/*.cpp"]),
+    hdrs = glob(["include/clang/Options/*.h"]),
+    includes = ["include"],
+    deps = [
+        ":basic",
+        ":driver_options_inc_gen",
+        ":static_analyzer_checkers_gen",
+        "//llvm:Option",
+    ],
+)
+
 cc_library(
     name = "driver",
     srcs = glob(
@@ -1544,6 +1558,7 @@ cc_library(
         ":config",
         ":driver_options_inc_gen",
         ":lex",
+        ":options",
         ":parse",
         ":static_analyzer_checkers_gen",
         "//llvm:BinaryFormat",
@@ -1700,6 +1715,7 @@ cc_library(
         ":driver_options_inc_gen",
         ":edit",
         ":lex",
+        ":options",
         ":parse",
         ":sema",
         ":serialization",
@@ -1769,6 +1785,7 @@ cc_library(
         ":frontend",
         ":frontend_tool",
         ":lex",
+        ":options",
         ":parse",
         ":sema",
         ":serialization",
@@ -2001,6 +2018,7 @@ cc_library(
         ":extract_api",
         ":frontend",
         ":frontend_rewrite",
+        ":options",
         ":static_analyzer_frontend",
         "//llvm:Option",
         "//llvm:Support",
@@ -2176,6 +2194,7 @@ cc_library(
         ":frontend_rewrite",
         ":frontend_tool",
         ":lex",
+        ":options",
         ":parse",
         ":sema",
         ":serialization",
@@ -2258,6 +2277,7 @@ cc_binary(
         ":driver",
         ":frontend",
         ":frontend_rewrite",
+        ":options",
         ":serialization",
         ":static_analyzer_frontend",
         ":tooling",

From a37c4e0fad5289e5fdd017ea5ac6162d71fb73ae Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Mon, 10 Nov 2025 13:03:20 -0800
Subject: [PATCH 10/97] [NFC][SpecialCaseList] Hide Section internals in
 private section (#167276)

Preparing to moving most of implementation out of the header file.

* https://github.com/llvm/llvm-project/pull/167280

---------

Co-authored-by: Naveen Seth Hanig <naveen.hanig@outlook.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 clang/lib/Basic/Diagnostic.cpp               |  2 +-
 clang/lib/Basic/ProfileList.cpp              |  2 +-
 clang/lib/Basic/SanitizerSpecialCaseList.cpp |  4 ++--
 llvm/include/llvm/Support/SpecialCaseList.h  | 23 +++++++++++++++-----
 llvm/lib/Support/SpecialCaseList.cpp         |  8 +++++++
 5 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 2dec26ecacf26..5e9da245e2b43 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -534,7 +534,7 @@ WarningsSpecialCaseList::create(const llvm::MemoryBuffer &Input,
 void WarningsSpecialCaseList::processSections(DiagnosticsEngine &Diags) {
   static constexpr auto WarningFlavor = clang::diag::Flavor::WarningOrError;
   for (const auto &SectionEntry : sections()) {
-    StringRef DiagGroup = SectionEntry.SectionStr;
+    StringRef DiagGroup = SectionEntry.name();
     if (DiagGroup == "*") {
       // Drop the default section introduced by special case list, we only
       // support exact diagnostic group names.
diff --git a/clang/lib/Basic/ProfileList.cpp b/clang/lib/Basic/ProfileList.cpp
index 9cb118893a0d9..8727057eb78d1 100644
--- a/clang/lib/Basic/ProfileList.cpp
+++ b/clang/lib/Basic/ProfileList.cpp
@@ -36,7 +36,7 @@ class ProfileSpecialCaseList : public llvm::SpecialCaseList {
 
   bool hasPrefix(StringRef Prefix) const {
     for (const auto &It : sections())
-      if (It.Entries.count(Prefix) > 0)
+      if (It.hasPrefix(Prefix))
         return true;
     return false;
   }
diff --git a/clang/lib/Basic/SanitizerSpecialCaseList.cpp b/clang/lib/Basic/SanitizerSpecialCaseList.cpp
index 56f551628cf89..928c086898097 100644
--- a/clang/lib/Basic/SanitizerSpecialCaseList.cpp
+++ b/clang/lib/Basic/SanitizerSpecialCaseList.cpp
@@ -42,7 +42,7 @@ void SanitizerSpecialCaseList::createSanitizerSections() {
     SanitizerMask Mask;
 
 #define SANITIZER(NAME, ID)                                                    \
-  if (S.SectionMatcher.matchAny(NAME))                                         \
+  if (S.matchName(NAME))                                                       \
     Mask |= SanitizerKind::ID;
 #define SANITIZER_GROUP(NAME, ID, ALIAS) SANITIZER(NAME, ID)
 
@@ -68,7 +68,7 @@ SanitizerSpecialCaseList::inSectionBlame(SanitizerMask Mask, StringRef Prefix,
     if (S.Mask & Mask) {
       unsigned LineNum = S.S.getLastMatch(Prefix, Query, Category);
       if (LineNum > 0)
-        return {S.S.FileIdx, LineNum};
+        return {S.S.fileIndex(), LineNum};
     }
   }
   return NotFound;
diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index cb8e568de02e0..dcc68b73d0675 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -201,17 +201,22 @@ class SpecialCaseList {
   using SectionEntries = StringMap<StringMap<Matcher>>;
 
 protected:
-  struct Section {
+  class Section {
+  public:
     Section(StringRef Str, unsigned FileIdx, bool UseGlobs)
         : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false), SectionStr(Str),
           FileIdx(FileIdx) {}
 
     Section(Section &&) = default;
 
-    Matcher SectionMatcher;
-    SectionEntries Entries;
-    std::string SectionStr;
-    unsigned FileIdx;
+    // Return name of the section, its entire string in [].
+    StringRef name() const { return SectionStr; }
+
+    // Returns true if string 'Name' matches section name interpreted as a glob.
+    LLVM_ABI bool matchName(StringRef Name) const;
+
+    // Return sequence number of the file where this section is defined.
+    unsigned fileIndex() const { return FileIdx; }
 
     // Helper method to search by Prefix, Query, and Category. Returns
     // 1-based line number on which rule is defined, or 0 if there is no match.
@@ -223,11 +228,19 @@ class SpecialCaseList {
     LLVM_ABI StringRef getLongestMatch(StringRef Prefix, StringRef Query,
                                        StringRef Category) const;
 
+    /// Returns true if the section has any entries for the given prefix.
+    LLVM_ABI bool hasPrefix(StringRef Prefix) const;
+
   private:
     friend class SpecialCaseList;
     LLVM_ABI void preprocess(bool OrderBySize);
     LLVM_ABI const SpecialCaseList::Matcher *
     findMatcher(StringRef Prefix, StringRef Category) const;
+
+    Matcher SectionMatcher;
+    std::string SectionStr;
+    SectionEntries Entries;
+    unsigned FileIdx;
   };
 
   ArrayRef<const Section> sections() const { return Sections; }
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index 246d90cce3a43..465e10ae87588 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -348,6 +348,10 @@ SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix,
   return NotFound;
 }
 
+bool SpecialCaseList::Section::matchName(StringRef Name) const {
+  return SectionMatcher.matchAny(Name);
+}
+
 const SpecialCaseList::Matcher *
 SpecialCaseList::Section::findMatcher(StringRef Prefix,
                                       StringRef Category) const {
@@ -393,4 +397,8 @@ StringRef SpecialCaseList::Section::getLongestMatch(StringRef Prefix,
   return LongestRule;
 }
 
+bool SpecialCaseList::Section::hasPrefix(StringRef Prefix) const {
+  return Entries.find(Prefix) != Entries.end();
+}
+
 } // namespace llvm

From b4a61517a64c6e65294e61a23276c87119f73667 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Mon, 10 Nov 2025 22:04:49 +0100
Subject: [PATCH 11/97] [CIR][NFC] Re-land: Add test for Complex imag literal
 GNU extension (#167383)

Re-land: Add test for Complex imag literal GNU extension after updating
the name
---
 clang/test/CIR/CodeGen/complex.cpp | 39 ++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
index 3fb78dc871904..4eab3999dfc42 100644
--- a/clang/test/CIR/CodeGen/complex.cpp
+++ b/clang/test/CIR/CodeGen/complex.cpp
@@ -1495,3 +1495,42 @@ void calling_function_that_return_complex() {
 // OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1
 // OGCG: store float %[[RESULT_REAL]], ptr %[[A_REAL_PTR]], align 4
 // OGCG: store float %[[RESULT_IMAG]], ptr %[[A_IMAG_PTR]], align 4
+
+void imag_literal_gnu_extension() {
+  float _Complex a = 3.0fi;
+  double _Complex b = 3.0i;
+  int _Complex c = 3i;
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a", init]
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.double>, !cir.ptr<!cir.complex<!cir.double>>, ["b", init]
+// CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+// CIR: %[[COMPLEX_A:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.float, #cir.fp<3.000000e+00> : !cir.float> : !cir.complex<!cir.float>
+// CIR: cir.store{{.*}} %[[COMPLEX_A]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
+// CIR: %[[COMPLEX_B:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.double, #cir.fp<3.000000e+00> : !cir.double> : !cir.complex<!cir.double>
+// CIR: cir.store{{.*}} %[[COMPLEX_B]], %[[B_ADDR]] : !cir.complex<!cir.double>, !cir.ptr<!cir.complex<!cir.double>>
+// CIR: %[[COMPLEX_C:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<3> : !s32i> : !cir.complex<!s32i>
+// CIR: cir.store{{.*}} %[[COMPLEX_C]], %[[C_ADDR]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+
+// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4
+// LLVM: %[[B_ADDR:.*]] = alloca { double, double }, i64 1, align 8
+// LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: store { float, float } { float 0.000000e+00, float 3.000000e+00 }, ptr %[[A_ADDR]], align 4
+// LLVM: store { double, double } { double 0.000000e+00, double 3.000000e+00 }, ptr %[[B_ADDR]], align 8
+// LLVM: store { i32, i32 } { i32 0, i32 3 }, ptr %[[C_ADDR]], align 4
+
+// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4
+// OGCG: %[[B_ADDR:.*]] = alloca { double, double }, align 8
+// OGCG: %[[C_ADDR:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0
+// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1
+// OGCG: store float 0.000000e+00, ptr %[[A_REAL_PTR]], align 4
+// OGCG: store float 3.000000e+00, ptr %[[A_IMAG_PTR]], align 4
+// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 0
+// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 1
+// OGCG: store double 0.000000e+00, ptr %[[B_REAL_PTR]], align 8
+// OGCG: store double 3.000000e+00, ptr %[[B_IMAG_PTR]], align 8
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 1
+// OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4
+// OGCG: store i32 3, ptr %[[C_IMAG_PTR]], align 4

From 540250ca7a58e9fa11d02b0e894166b0e4723f70 Mon Sep 17 00:00:00 2001
From: Kevin Sala Penades <salapenades1@llnl.gov>
Date: Mon, 10 Nov 2025 13:11:03 -0800
Subject: [PATCH 12/97] [OpenMP][Clang] Add codegen support for
 dyn_groupprivate clause (#152830)

This adds the codegen support for the dyn_groupprivate clause.
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         |   51 +-
 .../target_dyn_groupprivate_codegen.cpp       | 2633 +++++++++++++++++
 .../llvm/Frontend/OpenMP/OMPConstants.h       |   10 +
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   23 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |   30 +-
 5 files changed, 2718 insertions(+), 29 deletions(-)
 create mode 100644 clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 44ba72c5c76c7..1224fa681cdc0 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -10013,19 +10013,44 @@ static llvm::Value *emitDeviceID(
   return DeviceID;
 }
 
-static llvm::Value *emitDynCGGroupMem(const OMPExecutableDirective &D,
-                                      CodeGenFunction &CGF) {
-  llvm::Value *DynCGroupMem = CGF.Builder.getInt32(0);
-
-  if (auto *DynMemClause = D.getSingleClause<OMPXDynCGroupMemClause>()) {
-    CodeGenFunction::RunCleanupsScope DynCGroupMemScope(CGF);
-    llvm::Value *DynCGroupMemVal = CGF.EmitScalarExpr(
-        DynMemClause->getSize(), /*IgnoreResultAssign=*/true);
-    DynCGroupMem = CGF.Builder.CreateIntCast(DynCGroupMemVal, CGF.Int32Ty,
-                                             /*isSigned=*/false);
+static std::pair<llvm::Value *, OMPDynGroupprivateFallbackType>
+emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
+  llvm::Value *DynGP = CGF.Builder.getInt32(0);
+  auto DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
+
+  if (auto *DynGPClause = D.getSingleClause<OMPDynGroupprivateClause>()) {
+    CodeGenFunction::RunCleanupsScope DynGPScope(CGF);
+    llvm::Value *DynGPVal =
+        CGF.EmitScalarExpr(DynGPClause->getSize(), /*IgnoreResultAssign=*/true);
+    DynGP = CGF.Builder.CreateIntCast(DynGPVal, CGF.Int32Ty,
+                                      /*isSigned=*/false);
+    auto FallbackModifier = DynGPClause->getDynGroupprivateFallbackModifier();
+    switch (FallbackModifier) {
+    case OMPC_DYN_GROUPPRIVATE_FALLBACK_abort:
+      DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
+      break;
+    case OMPC_DYN_GROUPPRIVATE_FALLBACK_null:
+      DynGPFallback = OMPDynGroupprivateFallbackType::Null;
+      break;
+    case OMPC_DYN_GROUPPRIVATE_FALLBACK_default_mem:
+    case OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown:
+      // This is the default for dyn_groupprivate.
+      DynGPFallback = OMPDynGroupprivateFallbackType::DefaultMem;
+      break;
+    default:
+      llvm_unreachable("Unknown fallback modifier for OpenMP dyn_groupprivate");
+    }
+  } else if (auto *OMPXDynCGClause =
+                 D.getSingleClause<OMPXDynCGroupMemClause>()) {
+    CodeGenFunction::RunCleanupsScope DynCGMemScope(CGF);
+    llvm::Value *DynCGMemVal = CGF.EmitScalarExpr(OMPXDynCGClause->getSize(),
+                                                  /*IgnoreResultAssign=*/true);
+    DynGP = CGF.Builder.CreateIntCast(DynCGMemVal, CGF.Int32Ty,
+                                      /*isSigned=*/false);
   }
-  return DynCGroupMem;
+  return {DynGP, DynGPFallback};
 }
+
 static void genMapInfoForCaptures(
     MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
     const CapturedStmt &CS, llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
@@ -10234,7 +10259,7 @@ static void emitTargetCallKernelLaunch(
     llvm::Value *RTLoc = OMPRuntime->emitUpdateLocation(CGF, D.getBeginLoc());
     llvm::Value *NumIterations =
         OMPRuntime->emitTargetNumIterationsCall(CGF, D, SizeEmitter);
-    llvm::Value *DynCGGroupMem = emitDynCGGroupMem(D, CGF);
+    auto [DynCGroupMem, DynCGroupMemFallback] = emitDynCGroupMem(D, CGF);
     llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
         CGF.AllocaInsertPt->getParent(), CGF.AllocaInsertPt->getIterator());
 
@@ -10244,7 +10269,7 @@ static void emitTargetCallKernelLaunch(
 
     llvm::OpenMPIRBuilder::TargetKernelArgs Args(
         NumTargetItems, RTArgs, NumIterations, NumTeams, NumThreads,
-        DynCGGroupMem, HasNoWait);
+        DynCGroupMem, HasNoWait, DynCGroupMemFallback);
 
     llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
         cantFail(OMPRuntime->getOMPBuilder().emitKernelLaunch(
diff --git a/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp
new file mode 100644
index 0000000000000..758f35d629ace
--- /dev/null
+++ b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp
@@ -0,0 +1,2633 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK11
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK11
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+
+
+
+// We have 6 target regions
+
+
+
+// Check target registration is registered as a Ctor.
+
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+
+  #pragma omp target teams dyn_groupprivate(tx(20))
+  {
+  }
+
+  short b = 1;
+  #pragma omp target teams num_teams(b) dyn_groupprivate(1024)
+  {
+    a += b;
+  }
+
+  return a;
+}
+
+static
+int fstatic(int n) {
+
+  #pragma omp target teams distribute parallel for simd num_teams(n) dyn_groupprivate(n*32)
+  for (int i = 0; i < n ; ++i) {
+  }
+
+  #pragma omp target teams dyn_groupprivate(fallback(default_mem): 32+n) nowait
+  {
+  }
+
+  return n+1;
+}
+
+struct S1 {
+  double a;
+
+  int r1(int n){
+    int b = 1;
+
+    #pragma omp target teams dyn_groupprivate(fallback(null): n-b)
+    {
+      this->a = (double)b + 1.5;
+    }
+
+    #pragma omp target dyn_groupprivate(fallback(abort): 1024)
+    {
+      this->a = 2.5;
+    }
+
+    return (int)a;
+  }
+};
+
+int bar(int n){
+  int a = 0;
+
+  S1 S;
+  a += S.r1(n);
+
+  a += fstatic(n);
+
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// Check that the offloading functions are emitted and that the parallel function
+// is appropriately guarded.
+
+
+
+
+
+
+#endif
+
+// CHECK1-LABEL: define {{[^@]+}}@_Z3bari
+// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8
+// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    [[CALL:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(ptr noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP0]])
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[A]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    [[CALL1:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP2]])
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]]
+// CHECK1-NEXT:    store i32 [[ADD2]], ptr [[A]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    [[CALL3:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP4]])
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]]
+// CHECK1-NEXT:    store i32 [[ADD4]], ptr [[A]], align 4
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT:    ret i32 [[TMP6]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN2S12r1Ei
+// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[B:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store i32 1, ptr [[B]], align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
+// CHECK1-NEXT:    store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 4
+// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[B_CASTED]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT:    store ptr [[THIS1]], ptr [[TMP6]], align 8
+// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT:    store ptr [[A]], ptr [[TMP7]], align 8
+// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr null, ptr [[TMP8]], align 8
+// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK1-NEXT:    store i64 [[TMP3]], ptr [[TMP9]], align 8
+// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK1-NEXT:    store i64 [[TMP3]], ptr [[TMP10]], align 8
+// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK1-NEXT:    store ptr null, ptr [[TMP11]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK1-NEXT:    store i64 [[TMP5]], ptr [[TMP12]], align 8
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK1-NEXT:    store i64 [[TMP5]], ptr [[TMP13]], align 8
+// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK1-NEXT:    store ptr null, ptr [[TMP14]], align 8
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
+// CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT:    store ptr [[TMP15]], ptr [[TMP20]], align 8
+// CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr [[TMP16]], ptr [[TMP21]], align 8
+// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr @.offload_sizes, ptr [[TMP22]], align 8
+// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT:    store ptr @.offload_maptypes, ptr [[TMP23]], align 8
+// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT:    store ptr null, ptr [[TMP24]], align 8
+// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr null, ptr [[TMP25]], align 8
+// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 0, ptr [[TMP26]], align 8
+// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT:    store i64 4, ptr [[TMP27]], align 8
+// CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4
+// CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[TMP30]], align 4
+// CHECK1-NEXT:    [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK1-NEXT:    br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1:       omp_offload.failed:
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i64 [[TMP3]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
+// CHECK1:       omp_offload.cont:
+// CHECK1-NEXT:    [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
+// CHECK1-NEXT:    store ptr [[THIS1]], ptr [[TMP33]], align 8
+// CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
+// CHECK1-NEXT:    store ptr [[A2]], ptr [[TMP34]], align 8
+// CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr null, ptr [[TMP35]], align 8
+// CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 3, ptr [[TMP38]], align 4
+// CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 1, ptr [[TMP39]], align 4
+// CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
+// CHECK1-NEXT:    store ptr [[TMP36]], ptr [[TMP40]], align 8
+// CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr [[TMP37]], ptr [[TMP41]], align 8
+// CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr @.offload_sizes.1, ptr [[TMP42]], align 8
+// CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5
+// CHECK1-NEXT:    store ptr @.offload_maptypes.2, ptr [[TMP43]], align 8
+// CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6
+// CHECK1-NEXT:    store ptr null, ptr [[TMP44]], align 8
+// CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr null, ptr [[TMP45]], align 8
+// CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 0, ptr [[TMP46]], align 8
+// CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9
+// CHECK1-NEXT:    store i64 0, ptr [[TMP47]], align 8
+// CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10
+// CHECK1-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4
+// CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4
+// CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12
+// CHECK1-NEXT:    store i32 1024, ptr [[TMP50]], align 4
+// CHECK1-NEXT:    [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]])
+// CHECK1-NEXT:    [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
+// CHECK1-NEXT:    br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]]
+// CHECK1:       omp_offload.failed7:
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]]
+// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT8]]
+// CHECK1:       omp_offload.cont8:
+// CHECK1-NEXT:    [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP53:%.*]] = load double, ptr [[A9]], align 8
+// CHECK1-NEXT:    [[CONV:%.*]] = fptosi double [[TMP53]] to i32
+// CHECK1-NEXT:    ret i32 [[CONV]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZL7fstatici
+// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32
+// CHECK1-NEXT:    store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 8
+// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP4]], ptr [[TMP9]], align 8
+// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP4]], ptr [[TMP10]], align 8
+// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr null, ptr [[TMP11]], align 8
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK1-NEXT:    store i64 [[TMP6]], ptr [[TMP12]], align 8
+// CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK1-NEXT:    store i64 [[TMP6]], ptr [[TMP13]], align 8
+// CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK1-NEXT:    store ptr null, ptr [[TMP14]], align 8
+// CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK1-NEXT:    store i64 [[TMP8]], ptr [[TMP15]], align 8
+// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK1-NEXT:    store i64 [[TMP8]], ptr [[TMP16]], align 8
+// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK1-NEXT:    store ptr null, ptr [[TMP17]], align 8
+// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0
+// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT:    [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT:    store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
+// CHECK1-NEXT:    [[TMP24:%.*]] = zext i32 [[ADD]] to i64
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
+// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
+// CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 3, ptr [[TMP28]], align 4
+// CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT:    store ptr [[TMP18]], ptr [[TMP29]], align 8
+// CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr [[TMP19]], ptr [[TMP30]], align 8
+// CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr @.offload_sizes.3, ptr [[TMP31]], align 8
+// CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT:    store ptr @.offload_maptypes.4, ptr [[TMP32]], align 8
+// CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT:    store ptr null, ptr [[TMP33]], align 8
+// CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr null, ptr [[TMP34]], align 8
+// CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 [[TMP24]], ptr [[TMP35]], align 8
+// CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT:    store i64 8, ptr [[TMP36]], align 8
+// CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT:    store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4
+// CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4
+// CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT:    store i32 [[TMP25]], ptr [[TMP39]], align 4
+// CHECK1-NEXT:    [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT:    [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK1-NEXT:    br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1:       omp_offload.failed:
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP8]]) #[[ATTR2]]
+// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
+// CHECK1:       omp_offload.cont:
+// CHECK1-NEXT:    [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    [[ADD7:%.*]] = add nsw i32 32, [[TMP42]]
+// CHECK1-NEXT:    store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT:    [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT:    store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
+// CHECK1-NEXT:    [[TMP44:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 8
+// CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP44]], ptr [[TMP45]], align 8
+// CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP44]], ptr [[TMP46]], align 8
+// CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr null, ptr [[TMP47]], align 8
+// CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT:    store i32 [[TMP51]], ptr [[TMP50]], align 4
+// CHECK1-NEXT:    [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 64, i64 4, ptr @.omp_task_entry., i64 -1)
+// CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 8
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP55]], ptr align 4 [[AGG_CAPTURED]], i64 4, i1 false)
+// CHECK1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1
+// CHECK1-NEXT:    [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP57]], ptr align 8 [[TMP48]], i64 8, i1 false)
+// CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP58]], ptr align 8 [[TMP49]], i64 8, i1 false)
+// CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP59]], ptr align 8 @.offload_sizes.5, i64 8, i1 false)
+// CHECK1-NEXT:    [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]])
+// CHECK1-NEXT:    [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1
+// CHECK1-NEXT:    ret i32 [[ADD12]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i
+// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[A:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT:    [[B:%.*]] = alloca i16, align 2
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2
+// CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT:    [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
+// CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT:    store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr null, ptr [[TMP3]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT:    store ptr null, ptr [[TMP5]], align 8
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT:    store ptr null, ptr [[TMP6]], align 8
+// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr null, ptr [[TMP7]], align 8
+// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT:    store i64 8, ptr [[TMP9]], align 8
+// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT:    store i32 20, ptr [[TMP12]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK1-NEXT:    br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1:       omp_offload.failed:
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]]
+// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
+// CHECK1:       omp_offload.cont:
+// CHECK1-NEXT:    store i16 1, ptr [[B]], align 2
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i16, ptr [[B]], align 2
+// CHECK1-NEXT:    store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT:    store i32 [[TMP16]], ptr [[A_CASTED]], align 4
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i16, ptr [[B]], align 2
+// CHECK1-NEXT:    store i16 [[TMP18]], ptr [[B_CASTED]], align 2
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK1-NEXT:    store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP17]], ptr [[TMP22]], align 8
+// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT:    store i64 [[TMP17]], ptr [[TMP23]], align 8
+// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT:    store ptr null, ptr [[TMP24]], align 8
+// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK1-NEXT:    store i64 [[TMP19]], ptr [[TMP25]], align 8
+// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK1-NEXT:    store i64 [[TMP19]], ptr [[TMP26]], align 8
+// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK1-NEXT:    store ptr null, ptr [[TMP27]], align 8
+// CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK1-NEXT:    store i64 [[TMP21]], ptr [[TMP28]], align 8
+// CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK1-NEXT:    store i64 [[TMP21]], ptr [[TMP29]], align 8
+// CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK1-NEXT:    store ptr null, ptr [[TMP30]], align 8
+// CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK1-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
+// CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
+// CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
+// CHECK1-NEXT:    store ptr [[TMP31]], ptr [[TMP38]], align 8
+// CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr [[TMP32]], ptr [[TMP39]], align 8
+// CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP40]], align 8
+// CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5
+// CHECK1-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP41]], align 8
+// CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6
+// CHECK1-NEXT:    store ptr null, ptr [[TMP42]], align 8
+// CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr null, ptr [[TMP43]], align 8
+// CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 0, ptr [[TMP44]], align 8
+// CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9
+// CHECK1-NEXT:    store i64 8, ptr [[TMP45]], align 8
+// CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10
+// CHECK1-NEXT:    store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4
+// CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4
+// CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12
+// CHECK1-NEXT:    store i32 1024, ptr [[TMP48]], align 4
+// CHECK1-NEXT:    [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr [[KERNEL_ARGS1]])
+// CHECK1-NEXT:    [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0
+// CHECK1-NEXT:    br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]]
+// CHECK1:       omp_offload.failed2:
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i64 [[TMP17]], i64 [[TMP19]], i64 [[TMP21]]) #[[ATTR2]]
+// CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT3]]
+// CHECK1:       omp_offload.cont3:
+// CHECK1-NEXT:    [[TMP51:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT:    ret i32 [[TMP51]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
+// CHECK1-SAME: (ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
+// CHECK1-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
+// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK1-NEXT:    store double [[ADD]], ptr [[A]], align 8
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
+// CHECK1-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK1-NEXT:    store double 2.500000e+00, ptr [[A]], align 8
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
+// CHECK1-SAME: (i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK1-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1:       omp.precond.then:
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1:       cond.true:
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    br label [[COND_END:%.*]]
+// CHECK1:       cond.false:
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT:    br label [[COND_END]]
+// CHECK1:       cond.end:
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1:       omp.inner.for.cond:
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1:       omp.inner.for.body:
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1:       omp.inner.for.inc:
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK1:       omp.inner.for.end:
+// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1:       omp.loop.exit:
+// CHECK1-NEXT:    [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+// CHECK1-NEXT:    br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1:       .omp.final.then:
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0
+// CHECK1-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK1-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
+// CHECK1-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK1:       .omp.final.done:
+// CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK1:       omp.precond.end:
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK1-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK1-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1:       omp.precond.then:
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP4]] to i32
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT:    [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1:       cond.true:
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    br label [[COND_END:%.*]]
+// CHECK1:       cond.false:
+// CHECK1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    br label [[COND_END]]
+// CHECK1:       cond.end:
+// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1:       omp.inner.for.cond:
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1:       omp.inner.for.body:
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1:       omp.body.continue:
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1:       omp.inner.for.inc:
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK1-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK1:       omp.inner.for.end:
+// CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1:       omp.loop.exit:
+// CHECK1-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK1-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1:       .omp.final.then:
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK1-NEXT:    [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
+// CHECK1-NEXT:    [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
+// CHECK1-NEXT:    [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
+// CHECK1-NEXT:    store i32 [[ADD11]], ptr [[I4]], align 4
+// CHECK1-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK1:       .omp.final.done:
+// CHECK1-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK1:       omp.precond.end:
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
+// CHECK1-SAME: (i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
+// CHECK1-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTADDR3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTADDR3]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[TMP6]], align 8
+// CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[TMP8]], align 8
+// CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2
+// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTADDR3]], align 8
+// CHECK1-NEXT:    store ptr [[TMP9]], ptr [[TMP10]], align 8
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_task_entry.
+// CHECK1-SAME: (i32 noundef signext [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK1-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// CHECK1-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
+// CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1
+// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]])
+// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]])
+// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META29:![0-9]+]])
+// CHECK1-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META31:![0-9]+]])
+// CHECK1-NEXT:    store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META33:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
+// CHECK1-NEXT:    store i32 1, ptr [[TMP16]], align 4, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
+// CHECK1-NEXT:    store ptr [[TMP12]], ptr [[TMP17]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3
+// CHECK1-NEXT:    store ptr [[TMP13]], ptr [[TMP18]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4
+// CHECK1-NEXT:    store ptr [[TMP14]], ptr [[TMP19]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5
+// CHECK1-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP20]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6
+// CHECK1-NEXT:    store ptr null, ptr [[TMP21]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7
+// CHECK1-NEXT:    store ptr null, ptr [[TMP22]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8
+// CHECK1-NEXT:    store i64 0, ptr [[TMP23]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9
+// CHECK1-NEXT:    store i64 9, ptr [[TMP24]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11
+// CHECK1-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12
+// CHECK1-NEXT:    store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]])
+// CHECK1-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK1-NEXT:    br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]]
+// CHECK1:       omp_offload.failed.i:
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK1-NEXT:    store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META33]]
+// CHECK1-NEXT:    [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i64 [[TMP31]]) #[[ATTR2]]
+// CHECK1-NEXT:    br label [[DOTOMP_OUTLINED__EXIT]]
+// CHECK1:       .omp_outlined..exit:
+// CHECK1-NEXT:    ret i32 0
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
+// CHECK1-SAME: () #[[ATTR1]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
+// CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
+// CHECK1-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK1-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK1-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK1-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]])
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT:  entry:
+// CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z3bari
+// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[A:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4
+// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZN2S12r1Ei(ptr noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP0]])
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]]
+// CHECK3-NEXT:    store i32 [[ADD]], ptr [[A]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP2]])
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]]
+// CHECK3-NEXT:    store i32 [[ADD2]], ptr [[A]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[CALL3:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP4]])
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]]
+// CHECK3-NEXT:    store i32 [[ADD4]], ptr [[A]], align 4
+// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT:    ret i32 [[TMP6]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN2S12r1Ei
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[B:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT:    store i32 1, ptr [[B]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4
+// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
+// CHECK3-NEXT:    store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 4
+// CHECK3-NEXT:    store i32 [[TMP2]], ptr [[B_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr [[THIS1]], ptr [[TMP6]], align 4
+// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr [[A]], ptr [[TMP7]], align 4
+// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr null, ptr [[TMP8]], align 4
+// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[TMP9]], align 4
+// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[TMP10]], align 4
+// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT:    store ptr null, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[TMP12]], align 4
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[TMP13]], align 4
+// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr null, ptr [[TMP14]], align 4
+// CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
+// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr [[TMP15]], ptr [[TMP20]], align 4
+// CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr [[TMP16]], ptr [[TMP21]], align 4
+// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr @.offload_sizes, ptr [[TMP22]], align 4
+// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT:    store ptr @.offload_maptypes, ptr [[TMP23]], align 4
+// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT:    store ptr null, ptr [[TMP24]], align 4
+// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr null, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 0, ptr [[TMP26]], align 8
+// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT:    store i64 4, ptr [[TMP27]], align 8
+// CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4
+// CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT:    store i32 [[TMP17]], ptr [[TMP30]], align 4
+// CHECK3-NEXT:    [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT:    [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK3-NEXT:    br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3:       omp_offload.failed:
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i32 [[TMP3]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
+// CHECK3:       omp_offload.cont:
+// CHECK3-NEXT:    [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr [[THIS1]], ptr [[TMP33]], align 4
+// CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr [[A2]], ptr [[TMP34]], align 4
+// CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr null, ptr [[TMP35]], align 4
+// CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 3, ptr [[TMP38]], align 4
+// CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 1, ptr [[TMP39]], align 4
+// CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr [[TMP36]], ptr [[TMP40]], align 4
+// CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr [[TMP37]], ptr [[TMP41]], align 4
+// CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr @.offload_sizes.1, ptr [[TMP42]], align 4
+// CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5
+// CHECK3-NEXT:    store ptr @.offload_maptypes.2, ptr [[TMP43]], align 4
+// CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6
+// CHECK3-NEXT:    store ptr null, ptr [[TMP44]], align 4
+// CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr null, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 0, ptr [[TMP46]], align 8
+// CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9
+// CHECK3-NEXT:    store i64 0, ptr [[TMP47]], align 8
+// CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10
+// CHECK3-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4
+// CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4
+// CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12
+// CHECK3-NEXT:    store i32 1024, ptr [[TMP50]], align 4
+// CHECK3-NEXT:    [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]])
+// CHECK3-NEXT:    [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
+// CHECK3-NEXT:    br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]]
+// CHECK3:       omp_offload.failed7:
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]]
+// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT8]]
+// CHECK3:       omp_offload.cont8:
+// CHECK3-NEXT:    [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP53:%.*]] = load double, ptr [[A9]], align 4
+// CHECK3-NEXT:    [[CONV:%.*]] = fptosi double [[TMP53]] to i32
+// CHECK3-NEXT:    ret i32 [[CONV]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZL7fstatici
+// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32
+// CHECK3-NEXT:    store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT:    store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[TMP9]], align 4
+// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[TMP10]], align 4
+// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr null, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 [[TMP6]], ptr [[TMP12]], align 4
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 [[TMP6]], ptr [[TMP13]], align 4
+// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT:    store ptr null, ptr [[TMP14]], align 4
+// CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK3-NEXT:    store i32 [[TMP8]], ptr [[TMP15]], align 4
+// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK3-NEXT:    store i32 [[TMP8]], ptr [[TMP16]], align 4
+// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr null, ptr [[TMP17]], align 4
+// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK3-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0
+// CHECK3-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT:    [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT:    store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK3-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
+// CHECK3-NEXT:    [[TMP24:%.*]] = zext i32 [[ADD]] to i64
+// CHECK3-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
+// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 3, ptr [[TMP27]], align 4
+// CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
+// CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr [[TMP18]], ptr [[TMP29]], align 4
+// CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr [[TMP19]], ptr [[TMP30]], align 4
+// CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr @.offload_sizes.3, ptr [[TMP31]], align 4
+// CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT:    store ptr @.offload_maptypes.4, ptr [[TMP32]], align 4
+// CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT:    store ptr null, ptr [[TMP33]], align 4
+// CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr null, ptr [[TMP34]], align 4
+// CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 [[TMP24]], ptr [[TMP35]], align 8
+// CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT:    store i64 8, ptr [[TMP36]], align 8
+// CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT:    store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4
+// CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4
+// CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT:    store i32 [[TMP25]], ptr [[TMP39]], align 4
+// CHECK3-NEXT:    [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT:    [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK3-NEXT:    br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3:       omp_offload.failed:
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP8]]) #[[ATTR2]]
+// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
+// CHECK3:       omp_offload.cont:
+// CHECK3-NEXT:    [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[ADD7:%.*]] = add nsw i32 32, [[TMP42]]
+// CHECK3-NEXT:    store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK3-NEXT:    [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK3-NEXT:    store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
+// CHECK3-NEXT:    [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
+// CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP44]], ptr [[TMP45]], align 4
+// CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP44]], ptr [[TMP46]], align 4
+// CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr null, ptr [[TMP47]], align 4
+// CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK3-NEXT:    store i32 [[TMP51]], ptr [[TMP50]], align 4
+// CHECK3-NEXT:    [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 36, i32 4, ptr @.omp_task_entry., i64 -1)
+// CHECK3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 4
+// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP55]], ptr align 4 [[AGG_CAPTURED]], i32 4, i1 false)
+// CHECK3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0
+// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP57]], ptr align 4 @.offload_sizes.5, i32 8, i1 false)
+// CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1
+// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP58]], ptr align 4 [[TMP48]], i32 4, i1 false)
+// CHECK3-NEXT:    [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2
+// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP59]], ptr align 4 [[TMP49]], i32 4, i1 false)
+// CHECK3-NEXT:    [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]])
+// CHECK3-NEXT:    [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1
+// CHECK3-NEXT:    ret i32 [[ADD12]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i
+// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[A:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT:    [[B:%.*]] = alloca i16, align 2
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2
+// CHECK3-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT:    [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr null, ptr [[TMP2]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr null, ptr [[TMP3]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr null, ptr [[TMP4]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT:    store ptr null, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT:    store ptr null, ptr [[TMP6]], align 4
+// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr null, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 0, ptr [[TMP8]], align 8
+// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT:    store i64 8, ptr [[TMP9]], align 8
+// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT:    store i32 20, ptr [[TMP12]], align 4
+// CHECK3-NEXT:    [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK3-NEXT:    br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3:       omp_offload.failed:
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]]
+// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
+// CHECK3:       omp_offload.cont:
+// CHECK3-NEXT:    store i16 1, ptr [[B]], align 2
+// CHECK3-NEXT:    [[TMP15:%.*]] = load i16, ptr [[B]], align 2
+// CHECK3-NEXT:    store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK3-NEXT:    [[TMP16:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT:    store i32 [[TMP16]], ptr [[A_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP18:%.*]] = load i16, ptr [[B]], align 2
+// CHECK3-NEXT:    store i16 [[TMP18]], ptr [[B_CASTED]], align 2
+// CHECK3-NEXT:    [[TMP19:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK3-NEXT:    store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2
+// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP17]], ptr [[TMP22]], align 4
+// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 [[TMP17]], ptr [[TMP23]], align 4
+// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr null, ptr [[TMP24]], align 4
+// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 [[TMP19]], ptr [[TMP25]], align 4
+// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 [[TMP19]], ptr [[TMP26]], align 4
+// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT:    store ptr null, ptr [[TMP27]], align 4
+// CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK3-NEXT:    store i32 [[TMP21]], ptr [[TMP28]], align 4
+// CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK3-NEXT:    store i32 [[TMP21]], ptr [[TMP29]], align 4
+// CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr null, ptr [[TMP30]], align 4
+// CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK3-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
+// CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
+// CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
+// CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr [[TMP31]], ptr [[TMP38]], align 4
+// CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr [[TMP32]], ptr [[TMP39]], align 4
+// CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr @.offload_sizes.7, ptr [[TMP40]], align 4
+// CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5
+// CHECK3-NEXT:    store ptr @.offload_maptypes.8, ptr [[TMP41]], align 4
+// CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6
+// CHECK3-NEXT:    store ptr null, ptr [[TMP42]], align 4
+// CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr null, ptr [[TMP43]], align 4
+// CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 0, ptr [[TMP44]], align 8
+// CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9
+// CHECK3-NEXT:    store i64 8, ptr [[TMP45]], align 8
+// CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10
+// CHECK3-NEXT:    store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4
+// CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4
+// CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12
+// CHECK3-NEXT:    store i32 1024, ptr [[TMP48]], align 4
+// CHECK3-NEXT:    [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr [[KERNEL_ARGS1]])
+// CHECK3-NEXT:    [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0
+// CHECK3-NEXT:    br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]]
+// CHECK3:       omp_offload.failed2:
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i32 [[TMP17]], i32 [[TMP19]], i32 [[TMP21]]) #[[ATTR2]]
+// CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT3]]
+// CHECK3:       omp_offload.cont3:
+// CHECK3-NEXT:    [[TMP51:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT:    ret i32 [[TMP51]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
+// CHECK3-SAME: (ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]])
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK3-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
+// CHECK3-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
+// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK3-NEXT:    store double [[ADD]], ptr [[A]], align 4
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
+// CHECK3-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK3-NEXT:    store double 2.500000e+00, ptr [[A]], align 4
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
+// CHECK3-SAME: (i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]])
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK3-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK3-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3:       omp.precond.then:
+// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK3-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3:       cond.true:
+// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT:    br label [[COND_END:%.*]]
+// CHECK3:       cond.false:
+// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT:    br label [[COND_END]]
+// CHECK3:       cond.end:
+// CHECK3-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK3-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3:       omp.inner.for.cond:
+// CHECK3-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18:![0-9]+]]
+// CHECK3-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK3-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3:       omp.inner.for.body:
+// CHECK3-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT:    store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT:    [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3:       omp.inner.for.inc:
+// CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
+// CHECK3:       omp.inner.for.end:
+// CHECK3-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3:       omp.loop.exit:
+// CHECK3-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK3-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// CHECK3-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK3:       .omp.final.then:
+// CHECK3-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0
+// CHECK3-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK3-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
+// CHECK3-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK3:       .omp.final.done:
+// CHECK3-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK3:       omp.precond.end:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK3-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK3-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3:       omp.precond.then:
+// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK3-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3:       cond.true:
+// CHECK3-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT:    br label [[COND_END:%.*]]
+// CHECK3:       cond.false:
+// CHECK3-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT:    br label [[COND_END]]
+// CHECK3:       cond.end:
+// CHECK3-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK3-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3:       omp.inner.for.cond:
+// CHECK3-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK3-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK3-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK3-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3:       omp.inner.for.body:
+// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK3-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK3-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3:       omp.body.continue:
+// CHECK3-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3:       omp.inner.for.inc:
+// CHECK3-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK3-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK3-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK3-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK3:       omp.inner.for.end:
+// CHECK3-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3:       omp.loop.exit:
+// CHECK3-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK3-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
+// CHECK3-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK3-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK3:       .omp.final.then:
+// CHECK3-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK3-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
+// CHECK3-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
+// CHECK3-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK3-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
+// CHECK3-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK3:       .omp.final.done:
+// CHECK3-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK3:       omp.precond.end:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
+// CHECK3-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@.omp_task_privates_map.
+// CHECK3-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTADDR3:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
+// CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTADDR3]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR3]], align 4
+// CHECK3-NEXT:    store ptr [[TMP5]], ptr [[TMP6]], align 4
+// CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT:    store ptr [[TMP7]], ptr [[TMP8]], align 4
+// CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2
+// CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
+// CHECK3-NEXT:    store ptr [[TMP9]], ptr [[TMP10]], align 4
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@.omp_task_entry.
+// CHECK3-SAME: (i32 noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
+// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
+// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1
+// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]])
+// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META28:![0-9]+]])
+// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]])
+// CHECK3-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]])
+// CHECK3-NEXT:    store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META34:![0-9]+]]
+// CHECK3-NEXT:    store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]]
+// CHECK3-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
+// CHECK3-NEXT:    store i32 1, ptr [[TMP16]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
+// CHECK3-NEXT:    store ptr [[TMP12]], ptr [[TMP17]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3
+// CHECK3-NEXT:    store ptr [[TMP13]], ptr [[TMP18]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4
+// CHECK3-NEXT:    store ptr [[TMP14]], ptr [[TMP19]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5
+// CHECK3-NEXT:    store ptr @.offload_maptypes.6, ptr [[TMP20]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6
+// CHECK3-NEXT:    store ptr null, ptr [[TMP21]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7
+// CHECK3-NEXT:    store ptr null, ptr [[TMP22]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8
+// CHECK3-NEXT:    store i64 0, ptr [[TMP23]], align 8, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9
+// CHECK3-NEXT:    store i64 9, ptr [[TMP24]], align 8, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11
+// CHECK3-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12
+// CHECK3-NEXT:    store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]])
+// CHECK3-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK3-NEXT:    br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]]
+// CHECK3:       omp_offload.failed.i:
+// CHECK3-NEXT:    [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK3-NEXT:    store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i32 [[TMP31]]) #[[ATTR2]]
+// CHECK3-NEXT:    br label [[DOTOMP_OUTLINED__EXIT]]
+// CHECK3:       .omp_outlined..exit:
+// CHECK3-NEXT:    ret i32 0
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
+// CHECK3-SAME: () #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
+// CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
+// CHECK3-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK3-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
+// CHECK3-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK3-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK3-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
+// CHECK3-NEXT:    [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK3-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]])
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK3-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK3-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
+// CHECK3-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
+// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK9-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK9-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]])
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK9-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK9-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK9-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9:       omp.precond.then:
+// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK9-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9:       cond.true:
+// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    br label [[COND_END:%.*]]
+// CHECK9:       cond.false:
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    br label [[COND_END]]
+// CHECK9:       cond.end:
+// CHECK9-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK9-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9:       omp.inner.for.cond:
+// CHECK9-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK9-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK9-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9:       omp.inner.for.body:
+// CHECK9-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
+// CHECK9-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK9-NEXT:    [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT:    store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT:    [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9:       omp.inner.for.inc:
+// CHECK9-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK9-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK9:       omp.inner.for.end:
+// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9:       omp.loop.exit:
+// CHECK9-NEXT:    [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP22]])
+// CHECK9-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+// CHECK9-NEXT:    br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK9:       .omp.final.then:
+// CHECK9-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0
+// CHECK9-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK9-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
+// CHECK9-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK9:       .omp.final.done:
+// CHECK9-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK9:       omp.precond.end:
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    [[I4:%.*]] = alloca i32, align 4
+// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK9-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK9-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK9-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK9-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9:       omp.precond.then:
+// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP4]] to i32
+// CHECK9-NEXT:    [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT:    [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK9-NEXT:    store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK9-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9:       cond.true:
+// CHECK9-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT:    br label [[COND_END:%.*]]
+// CHECK9:       cond.false:
+// CHECK9-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    br label [[COND_END]]
+// CHECK9:       cond.end:
+// CHECK9-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK9-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9:       omp.inner.for.cond:
+// CHECK9-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20:![0-9]+]]
+// CHECK9-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP20]]
+// CHECK9-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK9-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9:       omp.inner.for.body:
+// CHECK9-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
+// CHECK9-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP20]]
+// CHECK9-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9:       omp.body.continue:
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9:       omp.inner.for.inc:
+// CHECK9-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
+// CHECK9-NEXT:    [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK9-NEXT:    store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
+// CHECK9-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]]
+// CHECK9:       omp.inner.for.end:
+// CHECK9-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9:       omp.loop.exit:
+// CHECK9-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK9-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]])
+// CHECK9-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK9-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK9:       .omp.final.then:
+// CHECK9-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK9-NEXT:    [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
+// CHECK9-NEXT:    [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
+// CHECK9-NEXT:    [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
+// CHECK9-NEXT:    store i32 [[ADD11]], ptr [[I4]], align 4
+// CHECK9-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK9:       .omp.final.done:
+// CHECK9-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK9:       omp.precond.end:
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK9-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
+// CHECK9-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]])
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK9-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
+// CHECK9-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
+// CHECK9-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK9-NEXT:    store double [[ADD]], ptr [[A]], align 8
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK9-NEXT:    store double 2.500000e+00, ptr [[A]], align 8
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK9-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
+// CHECK9-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK9-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
+// CHECK9-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK9-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK9-NEXT:    [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK9-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK9-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
+// CHECK9-NEXT:    [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK9-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]])
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT:  entry:
+// CHECK9-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK9-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK9-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK9-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
+// CHECK9-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK9-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
+// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK11-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[TMP2]], ptr [[N_CASTED]], align 4
+// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]])
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK11-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK11-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK11-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11:       omp.precond.then:
+// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK11-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK11-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11:       cond.true:
+// CHECK11-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT:    br label [[COND_END:%.*]]
+// CHECK11:       cond.false:
+// CHECK11-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT:    br label [[COND_END]]
+// CHECK11:       cond.end:
+// CHECK11-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK11-NEXT:    store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11:       omp.inner.for.cond:
+// CHECK11-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]]
+// CHECK11-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK11-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11:       omp.inner.for.body:
+// CHECK11-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT:    [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT:    store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT:    [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11:       omp.inner.for.inc:
+// CHECK11-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK11-NEXT:    store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK11:       omp.inner.for.end:
+// CHECK11-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11:       omp.loop.exit:
+// CHECK11-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK11-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP20]])
+// CHECK11-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// CHECK11-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK11:       .omp.final.then:
+// CHECK11-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT:    [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0
+// CHECK11-NEXT:    [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK11-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK11-NEXT:    [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK11-NEXT:    store i32 [[ADD8]], ptr [[I3]], align 4
+// CHECK11-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK11:       .omp.final.done:
+// CHECK11-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK11:       omp.precond.end:
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT:    [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK11-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK11-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK11-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11:       omp.precond.then:
+// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT:    store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK11-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK11-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11:       cond.true:
+// CHECK11-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT:    br label [[COND_END:%.*]]
+// CHECK11:       cond.false:
+// CHECK11-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT:    br label [[COND_END]]
+// CHECK11:       cond.end:
+// CHECK11-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK11-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11:       omp.inner.for.cond:
+// CHECK11-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]]
+// CHECK11-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK11-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK11-NEXT:    br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11:       omp.inner.for.body:
+// CHECK11-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK11-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK11-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK11-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK11:       omp.body.continue:
+// CHECK11-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11:       omp.inner.for.inc:
+// CHECK11-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK11-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK11-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK11-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK11:       omp.inner.for.end:
+// CHECK11-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11:       omp.loop.exit:
+// CHECK11-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK11-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]])
+// CHECK11-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT:    [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK11-NEXT:    br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK11:       .omp.final.then:
+// CHECK11-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK11-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
+// CHECK11-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
+// CHECK11-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK11-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
+// CHECK11-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK11:       .omp.final.done:
+// CHECK11-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK11:       omp.precond.end:
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK11-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[TMP1]], ptr [[B_CASTED]], align 4
+// CHECK11-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]])
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK11-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK11-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
+// CHECK11-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
+// CHECK11-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK11-NEXT:    store double [[ADD]], ptr [[A]], align 4
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK11-NEXT:    store double 2.500000e+00, ptr [[A]], align 4
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK11-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK11-NEXT:    [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
+// CHECK11-NEXT:    [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK11-NEXT:    call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
+// CHECK11-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK11-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK11-NEXT:    [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK11-NEXT:    store i16 [[TMP5]], ptr [[B_CASTED]], align 2
+// CHECK11-NEXT:    [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK11-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]])
+// CHECK11-NEXT:    ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT:  entry:
+// CHECK11-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK11-NEXT:    [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK11-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK11-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
+// CHECK11-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT:    ret void
+//
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index 7bec7e0c6736d..1ac9ac040468c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -190,6 +190,16 @@ enum class OMPScheduleType {
   LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask)
 };
 
+/// The fallback types for the dyn_groupprivate clause.
+enum class OMPDynGroupprivateFallbackType : uint64_t {
+  /// Abort the execution.
+  Abort = 0,
+  /// Return null pointer.
+  Null = 1,
+  /// Allocate from a implementation defined memory space.
+  DefaultMem = 2
+};
+
 // Default OpenMP mapper name suffix.
 inline constexpr const char *OmpDefaultMapperName = ".omp.default.mapper";
 
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index fd6b9729658c1..9f77c24d0b27b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2446,20 +2446,24 @@ class OpenMPIRBuilder {
     /// The number of threads.
     ArrayRef<Value *> NumThreads;
     /// The size of the dynamic shared memory.
-    Value *DynCGGroupMem = nullptr;
+    Value *DynCGroupMem = nullptr;
     /// True if the kernel has 'no wait' clause.
     bool HasNoWait = false;
+    /// The fallback mechanism for the shared memory.
+    omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
+        omp::OMPDynGroupprivateFallbackType::Abort;
 
     // Constructors for TargetKernelArgs.
     TargetKernelArgs() = default;
     TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
                      Value *NumIterations, ArrayRef<Value *> NumTeams,
-                     ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
-                     bool HasNoWait)
+                     ArrayRef<Value *> NumThreads, Value *DynCGroupMem,
+                     bool HasNoWait,
+                     omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback)
         : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
           NumIterations(NumIterations), NumTeams(NumTeams),
-          NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
-          HasNoWait(HasNoWait) {}
+          NumThreads(NumThreads), DynCGroupMem(DynCGroupMem),
+          HasNoWait(HasNoWait), DynCGroupMemFallback(DynCGroupMemFallback) {}
   };
 
   /// Create the kernel args vector used by emitTargetKernel. This function
@@ -3244,6 +3248,10 @@ class OpenMPIRBuilder {
   ///        dependency information as passed in the depend clause
   /// \param HasNowait Whether the target construct has a `nowait` clause or
   ///        not.
+  /// \param DynCGroupMem The size of the dynamic groupprivate memory for each
+  /// cgroup.
+  /// \param DynCGroupMem The fallback mechanism to execute if the requested
+  /// cgroup memory cannot be provided.
   LLVM_ABI InsertPointOrErrorTy createTarget(
       const LocationDescription &Loc, bool IsOffloadEntry,
       OpenMPIRBuilder::InsertPointTy AllocaIP,
@@ -3255,7 +3263,10 @@ class OpenMPIRBuilder {
       TargetBodyGenCallbackTy BodyGenCB,
       TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
       CustomMapperCallbackTy CustomMapperCB,
-      const SmallVector<DependData> &Dependencies, bool HasNowait = false);
+      const SmallVector<DependData> &Dependencies, bool HasNowait = false,
+      Value *DynCGroupMem = nullptr,
+      omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
+          omp::OMPDynGroupprivateFallbackType::Abort);
 
   /// Returns __kmpc_for_static_init_* runtime function for the specified
   /// size \a IVSize and sign \a IVSigned. Will create a distribute call
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 7dc32fda0eed6..18a4f0a08cb5f 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -530,7 +530,13 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
   auto Int32Ty = Type::getInt32Ty(Builder.getContext());
   constexpr size_t MaxDim = 3;
   Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
-  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
+
+  Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
+
+  Value *DynCGroupMemFallbackFlag =
+      Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
+  DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
+  Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
 
   assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
 
@@ -559,7 +565,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                 Flags,
                 NumTeams3D,
                 NumThreads3D,
-                KernelArgs.DynCGGroupMem};
+                KernelArgs.DynCGroupMem};
 }
 
 void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
@@ -8224,7 +8230,8 @@ static void emitTargetCall(
     OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
     OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
     const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
-    bool HasNoWait) {
+    bool HasNoWait, Value *DynCGroupMem,
+    OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
   // Generate a function call to the host fallback implementation of the target
   // region. This is called by the host when no offload entry was generated for
   // the target region and when the offloading call fails at runtime.
@@ -8360,12 +8367,13 @@ static void emitTargetCall(
                                                    /*isSigned=*/false)
                            : Builder.getInt64(0);
 
-    // TODO: Use correct DynCGGroupMem
-    Value *DynCGGroupMem = Builder.getInt32(0);
+    // Request zero groupprivate bytes by default.
+    if (!DynCGroupMem)
+      DynCGroupMem = Builder.getInt32(0);
 
-    KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
-                                              NumTeamsC, NumThreadsC,
-                                              DynCGGroupMem, HasNoWait);
+    KArgs = OpenMPIRBuilder::TargetKernelArgs(
+        NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
+        HasNoWait, DynCGroupMemFallback);
 
     // Assume no error was returned because TaskBodyCB and
     // EmitTargetCallFallbackCB don't produce any.
@@ -8414,7 +8422,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
     OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
     CustomMapperCallbackTy CustomMapperCB,
-    const SmallVector<DependData> &Dependencies, bool HasNowait) {
+    const SmallVector<DependData> &Dependencies, bool HasNowait,
+    Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
 
   if (!updateToLocation(Loc))
     return InsertPointTy();
@@ -8437,7 +8446,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
   if (!Config.isTargetDevice())
     emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
                    IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
-                   CustomMapperCB, Dependencies, HasNowait);
+                   CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
+                   DynCGroupMemFallback);
   return Builder.saveIP();
 }
 

From 20e1a1248013fcee994f5de13338b96777fe00cd Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Mon, 10 Nov 2025 13:25:20 -0800
Subject: [PATCH 13/97] [LLDB] Fix (more) darwin shell tests under ASAN

---
 lldb/test/Shell/helper/build.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/lldb/test/Shell/helper/build.py b/lldb/test/Shell/helper/build.py
index a5a7e997be044..1fa8aab92c128 100755
--- a/lldb/test/Shell/helper/build.py
+++ b/lldb/test/Shell/helper/build.py
@@ -804,7 +804,19 @@ def _get_link_command(self):
         args.extend(self._obj_file_names())
 
         if sys.platform == "darwin":
+            # By default, macOS doesn't allow injecting the ASAN
+            # runtime into system processes.
+            system_clang = (
+                subprocess.check_output(["xcrun", "-find", "clang"])
+                .strip()
+                .decode("utf-8")
+            )
+            system_liblto = os.path.join(
+                os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib"
+            )
             args.extend(["-isysroot", self.apple_sdk])
+            args.extend(["-Wl,-lto_library", "-Wl," + system_liblto])
+
         elif self.objc_gnustep_lib:
             args.extend(["-L", self.objc_gnustep_lib, "-lobjc"])
             if sys.platform == "linux":

From fb2fa21bd64a8c21d73438d7ea93dd6f15d15112 Mon Sep 17 00:00:00 2001
From: jofrn <jofernau@amd.com>
Date: Mon, 10 Nov 2025 16:27:55 -0500
Subject: [PATCH 14/97] [AMDGPU] Remove calling conv check on entry function
 (#162080)

It is undefined behavior to call a function with a mismatched calling
convention. Rather than crash on this behavior, it should compile.

This LLVM defect was identified via the AMD Fuzzing project.
---
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    |  7 --
 llvm/test/CodeGen/AMDGPU/cc-entry.ll          | 69 +++++++++++++++++++
 2 files changed, 69 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/cc-entry.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 0ea9add891111..b03d50f2d451d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -261,13 +261,6 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
 
         const Function *Callee = getCalleeFunction(*CalleeOp);
 
-        // Avoid crashing on undefined behavior with an illegal call to a
-        // kernel. If a callsite's calling convention doesn't match the
-        // function's, it's undefined behavior. If the callsite calling
-        // convention does match, that would have errored earlier.
-        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
-          report_fatal_error("invalid call to entry function");
-
         auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
           return F == &MF.getFunction();
         };
diff --git a/llvm/test/CodeGen/AMDGPU/cc-entry.ll b/llvm/test/CodeGen/AMDGPU/cc-entry.ll
new file mode 100644
index 0000000000000..d807f321c4009
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/cc-entry.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
+define amdgpu_kernel void @entry_fn() {
+; CHECK-LABEL: entry_fn:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_sext_i32_i16 s5, s5
+; CHECK-NEXT:    s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+8
+; CHECK-NEXT:    s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+16
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    s_endpgm
+entry:
+  call void @entry_fn()
+  ret void
+}
+
+define void @caller() {
+; CHECK-LABEL: caller:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    s_mov_b32 s0, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b32 s1, -1
+; CHECK-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_mov_b32 exec_lo, s1
+; CHECK-NEXT:    s_add_co_i32 s32, s32, 16
+; CHECK-NEXT:    v_writelane_b32 v40, s0, 2
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_sext_i32_i16 s5, s5
+; CHECK-NEXT:    s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+12
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+24
+; CHECK-NEXT:    v_mov_b32_e32 v0, v31
+; CHECK-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[10:11]
+; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    v_readlane_b32 s0, v40, 2
+; CHECK-NEXT:    s_or_saveexec_b32 s1, -1
+; CHECK-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_mov_b32 exec_lo, s1
+; CHECK-NEXT:    s_mov_b32 s33, s0
+; CHECK-NEXT:    s_wait_loadcnt 0x0
+; CHECK-NEXT:    s_wait_alu 0xfffe
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  call void @entry_fn()
+  ret void
+}

From 4b9d7e167b2ab6b7cc6d23516fad8c39ea9bc55f Mon Sep 17 00:00:00 2001
From: Jackson Stogel <jtstogel@gmail.com>
Date: Mon, 10 Nov 2025 13:39:34 -0800
Subject: [PATCH 15/97] Reapply "[libc] Return errno from OFD failure paths in
 fcntl." (#166658) (#166846)

The previous implementation in #166252 (rolled back in #166658) caused
buildbot failures due to a bug in an added test. The modified
`UseAfterClose` did not pass a `struct flock` value to `fcntl`:


https://github.com/llvm/llvm-project/blob/2d5170594147b42a37698760d6e0194eec4f1390/libc/test/src/fcntl/fcntl_test.cpp#L175

Which ASAN caught and errored in the `fcntl` implementation when the
unspecified argument was accessed:


https://github.com/llvm/llvm-project/blob/c12cb2892c808af459eaa270b8738a2b375ecc9b/libc/src/__support/OSUtil/linux/fcntl.cpp#L59
---
 libc/src/__support/OSUtil/linux/fcntl.cpp |   2 +-
 libc/test/src/fcntl/fcntl_test.cpp        | 161 +++++++++++++---------
 2 files changed, 100 insertions(+), 63 deletions(-)

diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp
index bb76eee90efd2..08db4859c6417 100644
--- a/libc/src/__support/OSUtil/linux/fcntl.cpp
+++ b/libc/src/__support/OSUtil/linux/fcntl.cpp
@@ -66,7 +66,7 @@ ErrorOr<int> fcntl(int fd, int cmd, void *arg) {
         LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
     // On failure, return
     if (ret < 0)
-      return Error(-1);
+      return Error(-ret);
     // Check for overflow, i.e. the offsets are not the same when cast
     // to off_t from off64_t.
     if (static_cast<off_t>(flk64.l_len) != flk64.l_len ||
diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp
index 84feb34e537a0..d008aea54b425 100644
--- a/libc/test/src/fcntl/fcntl_test.cpp
+++ b/libc/test/src/fcntl/fcntl_test.cpp
@@ -94,68 +94,105 @@ TEST_F(LlvmLibcFcntlTest, FcntlSetFl) {
   ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
 }
 
-TEST_F(LlvmLibcFcntlTest, FcntlGetLkRead) {
-  using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test";
-  auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
-
-  struct flock flk, svflk;
-  int retVal;
-  int fd =
-      LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU);
-  ASSERT_ERRNO_SUCCESS();
-  ASSERT_GT(fd, 0);
-
-  flk.l_type = F_RDLCK;
-  flk.l_start = 0;
-  flk.l_whence = SEEK_SET;
-  flk.l_len = 50;
-
-  // copy flk into svflk
-  svflk = flk;
-
-  retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk);
-  ASSERT_ERRNO_SUCCESS();
-  ASSERT_GT(retVal, -1);
-  ASSERT_NE((int)flk.l_type, F_WRLCK); // File should not be write locked.
-
-  retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk);
-  ASSERT_ERRNO_SUCCESS();
-  ASSERT_GT(retVal, -1);
-
-  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
-}
-
-TEST_F(LlvmLibcFcntlTest, FcntlGetLkWrite) {
-  using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test";
-  auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
-
-  struct flock flk, svflk;
-  int retVal;
-  int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
-  ASSERT_ERRNO_SUCCESS();
-  ASSERT_GT(fd, 0);
-
-  flk.l_type = F_WRLCK;
-  flk.l_start = 0;
-  flk.l_whence = SEEK_SET;
-  flk.l_len = 0;
-
-  // copy flk into svflk
-  svflk = flk;
-
-  retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk);
-  ASSERT_ERRNO_SUCCESS();
-  ASSERT_GT(retVal, -1);
-  ASSERT_NE((int)flk.l_type, F_RDLCK); // File should not be read locked.
-
-  retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk);
-  ASSERT_ERRNO_SUCCESS();
-  ASSERT_GT(retVal, -1);
-
-  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
-}
+/* Tests that are common between OFD and traditional variants of fcntl locks. */
+template <int GETLK_CMD, int SETLK_CMD>
+class LibcFcntlCommonLockTests : public LlvmLibcFcntlTest {
+public:
+  void GetLkRead() {
+    using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+    constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test";
+    const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
+
+    struct flock flk = {};
+    struct flock svflk = {};
+    int retVal;
+    int fd =
+        LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU);
+    ASSERT_ERRNO_SUCCESS();
+    ASSERT_GT(fd, 0);
+
+    flk.l_type = F_RDLCK;
+    flk.l_start = 0;
+    flk.l_whence = SEEK_SET;
+    flk.l_len = 50;
+
+    // copy flk into svflk
+    svflk = flk;
+
+    retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk);
+    ASSERT_ERRNO_SUCCESS();
+    ASSERT_GT(retVal, -1);
+    ASSERT_NE((int)svflk.l_type, F_WRLCK); // File should not be write locked.
+
+    retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk);
+    ASSERT_ERRNO_SUCCESS();
+    ASSERT_GT(retVal, -1);
+
+    ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
+  }
+
+  void GetLkWrite() {
+    using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+    constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test";
+    const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
+
+    struct flock flk = {};
+    struct flock svflk = {};
+    int retVal;
+    int fd =
+        LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
+    ASSERT_ERRNO_SUCCESS();
+    ASSERT_GT(fd, 0);
+
+    flk.l_type = F_WRLCK;
+    flk.l_start = 0;
+    flk.l_whence = SEEK_SET;
+    flk.l_len = 0;
+
+    // copy flk into svflk
+    svflk = flk;
+
+    retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk);
+    ASSERT_ERRNO_SUCCESS();
+    ASSERT_GT(retVal, -1);
+    ASSERT_NE((int)svflk.l_type, F_RDLCK); // File should not be read locked.
+
+    retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk);
+    ASSERT_ERRNO_SUCCESS();
+    ASSERT_GT(retVal, -1);
+
+    ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
+  }
+
+  void UseAfterClose() {
+    using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+    constexpr const char *TEST_FILE_NAME =
+        "testdata/fcntl_use_after_close.test";
+    const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
+    int fd =
+        LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
+    ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
+
+    flock flk = {};
+    flk.l_type = F_RDLCK;
+    flk.l_start = 0;
+    flk.l_whence = SEEK_SET;
+    flk.l_len = 50;
+    ASSERT_EQ(-1, LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &flk));
+    ASSERT_ERRNO_EQ(EBADF);
+  }
+};
+
+#define COMMON_LOCK_TESTS(NAME, GETLK_CMD, SETLK_CMD)                          \
+  using NAME = LibcFcntlCommonLockTests<GETLK_CMD, SETLK_CMD>;                 \
+  TEST_F(NAME, GetLkRead) { GetLkRead(); }                                     \
+  TEST_F(NAME, GetLkWrite) { GetLkWrite(); }                                   \
+  TEST_F(NAME, UseAfterClose) { UseAfterClose(); }                             \
+  static_assert(true, "Require semicolon.")
+
+COMMON_LOCK_TESTS(LlvmLibcFcntlProcessAssociatedLockTest, F_GETLK, F_SETLK);
+COMMON_LOCK_TESTS(LlvmLibcFcntlOpenFileDescriptionLockTest, F_OFD_GETLK,
+                  F_OFD_SETLK);
 
 TEST_F(LlvmLibcFcntlTest, UseAfterClose) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;

From 8c86bc89c161f932ea2dfa0ac23b465d65ac048f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 13:54:35 -0800
Subject: [PATCH 16/97] AMDGPU/GlobalISel: Fix AGPR regbank check for
 mfma_scale (#167393)

Fixes regressions with #159493 after 476a6ea9575
---
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 54ba2f8c0d519..dbca5afeca816 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5081,17 +5081,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       unsigned MinNumRegsRequired = DstSize / 32;
 
       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+      bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired);
+
       OpdsMapping[0] =
-          Info->getMinNumAGPRs() >= MinNumRegsRequired
-              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
-              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+          UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
+                      : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
 
       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
       OpdsMapping[4] =
-          Info->getMinNumAGPRs() >= MinNumRegsRequired
-              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
-              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+          UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+                      : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
 
       OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
       OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI);

From 0767c640437a8ff02e431a350168a13cb9056428 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 10 Nov 2025 21:55:18 +0000
Subject: [PATCH 17/97] [VPlan] Use getDefiningRecipe instead of directly
 accessing Def. (NFC)

Use getDefiningRecipe to future-proof the code. Split off from
https://github.com/llvm/llvm-project/pull/156262 as suggested.
---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5e4303a4c5fff..90696ffc3aca7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -99,20 +99,20 @@ VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
 
 VPValue::~VPValue() {
   assert(Users.empty() && "trying to delete a VPValue with remaining users");
-  if (Def)
+  if (VPDef *Def = getDefiningRecipe())
     Def->removeDefinedValue(this);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
-  if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def))
+  if (const VPRecipeBase *R = getDefiningRecipe())
     R->print(OS, "", SlotTracker);
   else
     printAsOperand(OS, SlotTracker);
 }
 
 void VPValue::dump() const {
-  const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def);
+  const VPRecipeBase *Instr = getDefiningRecipe();
   VPSlotTracker SlotTracker(
       (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
   print(dbgs(), SlotTracker);

From a1934ee500dc0bc7f4ad6f989a83eaec90b09597 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Mon, 10 Nov 2025 14:04:01 -0800
Subject: [PATCH 18/97] [NFC][SpecialCaseList] Replace callback with return
 value (#165943)

This commit introduces `SpecialCaseList::Match`, a small struct to hold
the matched rule and its line number. This simplifies the `match`
methods by allowing them to return a single value instead of using a
callback.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 llvm/include/llvm/Support/SpecialCaseList.h | 24 +++----
 llvm/lib/Support/SpecialCaseList.cpp        | 71 +++++++++++----------
 2 files changed, 45 insertions(+), 50 deletions(-)

diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index dcc68b73d0675..dee4db584c88b 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -126,15 +126,16 @@ class SpecialCaseList {
   SpecialCaseList &operator=(SpecialCaseList const &) = delete;
 
 private:
+  using Match = std::pair<StringRef, unsigned>;
+  static constexpr Match NotMatched = {"", 0};
+
   // Lagacy v1 matcher.
   class RegexMatcher {
   public:
     LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
     LLVM_ABI void preprocess(bool BySize);
 
-    LLVM_ABI void
-    match(StringRef Query,
-          llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const;
+    LLVM_ABI Match match(StringRef Query) const;
 
     struct Reg {
       Reg(StringRef Name, unsigned LineNo, Regex &&Rg)
@@ -152,9 +153,7 @@ class SpecialCaseList {
     LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
     LLVM_ABI void preprocess(bool BySize);
 
-    LLVM_ABI void
-    match(StringRef Query,
-          llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const;
+    LLVM_ABI Match match(StringRef Query) const;
 
     struct Glob {
       Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern)
@@ -168,11 +167,10 @@ class SpecialCaseList {
 
     RadixTree<iterator_range<StringRef::const_iterator>,
               RadixTree<iterator_range<StringRef::const_reverse_iterator>,
-                        SmallVector<const GlobMatcher::Glob *, 1>>>
+                        SmallVector<int, 1>>>
         PrefixSuffixToGlob;
 
-    RadixTree<iterator_range<StringRef::const_iterator>,
-              SmallVector<const GlobMatcher::Glob *, 1>>
+    RadixTree<iterator_range<StringRef::const_iterator>, SmallVector<int, 1>>
         SubstrToGlob;
   };
 
@@ -184,14 +182,10 @@ class SpecialCaseList {
     LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
     LLVM_ABI void preprocess(bool BySize);
 
-    LLVM_ABI void
-    match(StringRef Query,
-          llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const;
+    LLVM_ABI Match match(StringRef Query) const;
 
     LLVM_ABI bool matchAny(StringRef Query) const {
-      bool R = false;
-      match(Query, [&](StringRef, unsigned) { R = true; });
-      return R;
+      return match(Query) != NotMatched;
     }
 
     std::variant<RegexMatcher, GlobMatcher> M;
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index 465e10ae87588..395a55d75acd4 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -15,11 +15,13 @@
 
 #include "llvm/Support/SpecialCaseList.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/LineIterator.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <limits>
 #include <memory>
@@ -63,12 +65,12 @@ void SpecialCaseList::RegexMatcher::preprocess(bool BySize) {
   }
 }
 
-void SpecialCaseList::RegexMatcher::match(
-    StringRef Query,
-    llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const {
+SpecialCaseList::Match
+SpecialCaseList::RegexMatcher::match(StringRef Query) const {
   for (const auto &R : reverse(RegExes))
     if (R.Rg.match(Query))
-      return Cb(R.Name, R.LineNo);
+      return {R.Name, R.LineNo};
+  return NotMatched;
 }
 
 Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern,
@@ -90,7 +92,7 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) {
     });
   }
 
-  for (const auto &G : reverse(Globs)) {
+  for (const auto &[Idx, G] : enumerate(Globs)) {
     StringRef Prefix = G.Pattern.prefix();
     StringRef Suffix = G.Pattern.suffix();
 
@@ -102,26 +104,29 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) {
         // But only if substring is not empty. Searching this tree is more
         // expensive.
         auto &V = SubstrToGlob.emplace(Substr).first->second;
-        V.emplace_back(&G);
+        V.emplace_back(Idx);
         continue;
       }
     }
 
     auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second;
     auto &V = SToGlob.emplace(reverse(Suffix)).first->second;
-    V.emplace_back(&G);
+    V.emplace_back(Idx);
   }
 }
 
-void SpecialCaseList::GlobMatcher::match(
-    StringRef Query,
-    llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const {
+SpecialCaseList::Match
+SpecialCaseList::GlobMatcher::match(StringRef Query) const {
+  int Best = -1;
   if (!PrefixSuffixToGlob.empty()) {
     for (const auto &[_, SToGlob] : PrefixSuffixToGlob.find_prefixes(Query)) {
       for (const auto &[_, V] : SToGlob.find_prefixes(reverse(Query))) {
-        for (const auto *G : V) {
-          if (G->Pattern.match(Query)) {
-            Cb(G->Name, G->LineNo);
+        for (int Idx : reverse(V)) {
+          if (Best > Idx)
+            break;
+          const GlobMatcher::Glob &G = Globs[Idx];
+          if (G.Pattern.match(Query)) {
+            Best = Idx;
             // As soon as we find a match in the vector, we can break for this
             // vector, since the globs are already sorted by priority within the
             // prefix group. However, we continue searching other prefix groups
@@ -138,9 +143,12 @@ void SpecialCaseList::GlobMatcher::match(
     // possibilities. In most cases search will fail on first characters.
     for (StringRef Q = Query; !Q.empty(); Q = Q.drop_front()) {
       for (const auto &[_, V] : SubstrToGlob.find_prefixes(Q)) {
-        for (const auto *G : V) {
-          if (G->Pattern.match(Query)) {
-            Cb(G->Name, G->LineNo);
+        for (int Idx : reverse(V)) {
+          if (Best > Idx)
+            break;
+          const GlobMatcher::Glob &G = Globs[Idx];
+          if (G.Pattern.match(Query)) {
+            Best = Idx;
             // As soon as we find a match in the vector, we can break for this
             // vector, since the globs are already sorted by priority within the
             // prefix group. However, we continue searching other prefix groups
@@ -151,6 +159,9 @@ void SpecialCaseList::GlobMatcher::match(
       }
     }
   }
+  if (Best < 0)
+    return NotMatched;
+  return {Globs[Best].Name, Globs[Best].LineNo};
 }
 
 SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash)
@@ -169,12 +180,11 @@ void SpecialCaseList::Matcher::preprocess(bool BySize) {
   return std::visit([&](auto &V) { return V.preprocess(BySize); }, M);
 }
 
-void SpecialCaseList::Matcher::match(
-    StringRef Query,
-    llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const {
+SpecialCaseList::Match SpecialCaseList::Matcher::match(StringRef Query) const {
   if (RemoveDotSlash)
     Query = llvm::sys::path::remove_leading_dotslash(Query);
-  return std::visit([&](auto &V) { return V.match(Query, Cb); }, M);
+  return std::visit(
+      [&](auto &V) -> SpecialCaseList::Match { return V.match(Query); }, M);
 }
 
 // TODO: Refactor this to return Expected<...>
@@ -375,26 +385,17 @@ LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) {
 unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix,
                                                 StringRef Query,
                                                 StringRef Category) const {
-  unsigned LastLine = 0;
-  if (const Matcher *M = findMatcher(Prefix, Category)) {
-    M->match(Query, [&](StringRef, unsigned LineNo) {
-      LastLine = std::max(LastLine, LineNo);
-    });
-  }
-  return LastLine;
+  if (const Matcher *M = findMatcher(Prefix, Category))
+    return M->match(Query).second;
+  return 0;
 }
 
 StringRef SpecialCaseList::Section::getLongestMatch(StringRef Prefix,
                                                     StringRef Query,
                                                     StringRef Category) const {
-  StringRef LongestRule;
-  if (const Matcher *M = findMatcher(Prefix, Category)) {
-    M->match(Query, [&](StringRef Rule, unsigned) {
-      if (LongestRule.size() < Rule.size())
-        LongestRule = Rule;
-    });
-  }
-  return LongestRule;
+  if (const Matcher *M = findMatcher(Prefix, Category))
+    return M->match(Query).first;
+  return {};
 }
 
 bool SpecialCaseList::Section::hasPrefix(StringRef Prefix) const {

From 046ae855360614c5980d0ced0c55b5de507bd9ad Mon Sep 17 00:00:00 2001
From: Christopher Ferris <cferris1000@users.noreply.github.com>
Date: Mon, 10 Nov 2025 14:17:23 -0800
Subject: [PATCH 19/97] [scudo] Small cleanup of memory tagging code. (#166860)

Make the systemSupportsMemoryTagging() function return even on system
that don't support memory tagging. This avoids the need to always check
if memory tagging is supported before calling th function.

Make systemSupportsMemoryTagging() cache the getauxval return value
instead of calling the function every time.

Updated the code that calls systemSupportsMemoryTagging().
---
 compiler-rt/lib/scudo/standalone/combined.h                | 3 +--
 compiler-rt/lib/scudo/standalone/memtag.h                  | 7 +++----
 compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp     | 1 -
 compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp  | 4 +---
 .../lib/scudo/standalone/tests/wrappers_cpp_test.cpp       | 3 +--
 5 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index ffe9554203241..2d0f4f53d2ac5 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -171,8 +171,7 @@ class Allocator {
       Primary.Options.set(OptionBit::DeallocTypeMismatch);
     if (getFlags()->delete_size_mismatch)
       Primary.Options.set(OptionBit::DeleteSizeMismatch);
-    if (allocatorSupportsMemoryTagging<AllocatorConfig>() &&
-        systemSupportsMemoryTagging())
+    if (systemSupportsMemoryTagging())
       Primary.Options.set(OptionBit::UseMemoryTagging);
 
     QuarantineMaxChunkSize =
diff --git a/compiler-rt/lib/scudo/standalone/memtag.h b/compiler-rt/lib/scudo/standalone/memtag.h
index 83ebe676433eb..52897b3dca6ae 100644
--- a/compiler-rt/lib/scudo/standalone/memtag.h
+++ b/compiler-rt/lib/scudo/standalone/memtag.h
@@ -66,7 +66,8 @@ inline bool systemSupportsMemoryTagging() {
 #ifndef HWCAP2_MTE
 #define HWCAP2_MTE (1 << 18)
 #endif
-  return getauxval(AT_HWCAP2) & HWCAP2_MTE;
+  static bool SupportsMemoryTagging = getauxval(AT_HWCAP2) & HWCAP2_MTE;
+  return SupportsMemoryTagging;
 }
 
 inline bool systemDetectsMemoryTagFaultsTestOnly() {
@@ -261,9 +262,7 @@ inline uptr loadTag(uptr Ptr) {
 
 #else
 
-inline NORETURN bool systemSupportsMemoryTagging() {
-  UNREACHABLE("memory tagging not supported");
-}
+inline bool systemSupportsMemoryTagging() { return false; }
 
 inline NORETURN bool systemDetectsMemoryTagFaultsTestOnly() {
   UNREACHABLE("memory tagging not supported");
diff --git a/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp b/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp
index 09093e11452dd..d0d93316f212e 100644
--- a/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp
@@ -28,7 +28,6 @@ TEST(MemtagBasicDeathTest, Unsupported) {
   EXPECT_DEATH(untagPointer((uptr)0), "not supported");
   EXPECT_DEATH(extractTag((uptr)0), "not supported");
 
-  EXPECT_DEATH(systemSupportsMemoryTagging(), "not supported");
   EXPECT_DEATH(systemDetectsMemoryTagFaultsTestOnly(), "not supported");
   EXPECT_DEATH(enableSystemMemoryTaggingTestOnly(), "not supported");
 
diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
index 855a3e6e6109f..8741c8299b57c 100644
--- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
@@ -27,9 +27,7 @@
 const scudo::uptr PageSize = scudo::getPageSizeCached();
 
 template <typename Config> static scudo::Options getOptionsForConfig() {
-  if (!Config::getMaySupportMemoryTagging() ||
-      !scudo::archSupportsMemoryTagging() ||
-      !scudo::systemSupportsMemoryTagging())
+  if (!scudo::systemSupportsMemoryTagging())
     return {};
   scudo::AtomicOptions AO;
   AO.set(scudo::OptionBit::UseMemoryTagging);
diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp
index c802ed22fbad0..84359251a02aa 100644
--- a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp
@@ -187,8 +187,7 @@ TEST_F(ScudoWrappersCppTest, ThreadedNew) {
   // TODO: Investigate why libc sometimes crashes with tag missmatch in
   // __pthread_clockjoin_ex.
   std::unique_ptr<scudo::ScopedDisableMemoryTagChecks> NoTags;
-  if (!SCUDO_ANDROID && scudo::archSupportsMemoryTagging() &&
-      scudo::systemSupportsMemoryTagging())
+  if (!SCUDO_ANDROID && scudo::systemSupportsMemoryTagging())
     NoTags = std::make_unique<scudo::ScopedDisableMemoryTagChecks>();
 
   Ready = false;

From 8b1cc2d5f5c9f44d39e39382e8628f258c1bf219 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 10 Nov 2025 22:18:34 +0000
Subject: [PATCH 20/97] [VPlan] Update canNarrowLoad to check WidenMember0's op
 first (NFCI).

This hardens the code to check based on WideMember0's operands. This
ensures each call will go through the same check. Should be NFC
currently but needed when generalizing in follow-up patches.
---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f5bef08fafcdc..eab642678702a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4147,13 +4147,13 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
 /// is defined at \p Idx of a load interleave group.
 static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
                           VPValue *OpV, unsigned Idx) {
-  auto *DefR = OpV->getDefiningRecipe();
-  if (!DefR)
-    return WideMember0->getOperand(OpIdx) == OpV;
-  if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
-    return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV;
-
-  if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
+  VPValue *Member0Op = WideMember0->getOperand(OpIdx);
+  VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
+  if (!Member0OpR)
+    return Member0Op == OpV;
+  if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
+    return !W->getMask() && Member0Op == OpV;
+  if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
     return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
   return false;
 }

From 95db31e7f69e7008f0c570ed30d54d1e418e10f2 Mon Sep 17 00:00:00 2001
From: Rahman Lavaee <rahmanl@google.com>
Date: Mon, 10 Nov 2025 14:19:12 -0800
Subject: [PATCH 21/97] Treat specifying a function in the bbsection profile
 without any directive as noop. (#167359)

---
 .../Inputs/basic-block-sections.funcnames     |  4 +-
 clang/test/CodeGen/basic-block-sections.c     |  6 +-
 .../CodeGen/BasicBlockSectionsProfileReader.h | 16 ++---
 llvm/lib/CodeGen/BasicBlockSections.cpp       | 23 +++----
 .../BasicBlockSectionsProfileReader.cpp       | 16 ++---
 .../CodeGen/X86/basic-block-sections-list.ll  | 62 +++----------------
 .../X86/basic-block-sections-source-drift.ll  |  8 ++-
 7 files changed, 45 insertions(+), 90 deletions(-)

diff --git a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames
index 329cea9a0adfb..2452ee345fe2f 100644
--- a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames
+++ b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames
@@ -1 +1,3 @@
-!world
+v1
+f world
+c 0
diff --git a/clang/test/CodeGen/basic-block-sections.c b/clang/test/CodeGen/basic-block-sections.c
index a61b8dd4ac376..0c21a4cb1442c 100644
--- a/clang/test/CodeGen/basic-block-sections.c
+++ b/clang/test/CodeGen/basic-block-sections.c
@@ -30,8 +30,10 @@ int another(int a) {
 //
 // BB_WORLD: .section .text.world,"ax",@progbits{{$}}
 // BB_WORLD: world:
-// BB_WORLD: .section .text.world,"ax",@progbits,unique
-// BB_WORLD: world.__part.1:
+// BB_ALL: .section .text.world,"ax",@progbits,unique
+// BB_ALL: world.__part.1:
+// BB_LIST: .section .text.split.world,"ax",@progbits
+// BB_LIST: world.cold:
 // BB_ALL: .section .text.another,"ax",@progbits
 // BB_ALL: another.__part.1:
 // BB_LIST-NOT: .section .text.another,"ax",@progbits
diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
index 7b1a5f5019589..ee1f28377f7e4 100644
--- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
+++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h
@@ -68,17 +68,13 @@ class BasicBlockSectionsProfileReader {
 
   BasicBlockSectionsProfileReader() = default;
 
-  // Returns true if basic block sections profile exist for function \p
-  // FuncName.
+  // Returns true if function \p FuncName is hot based on the basic block
+  // section profile.
   bool isFunctionHot(StringRef FuncName) const;
 
-  // Returns a pair with first element representing whether basic block sections
-  // profile exist for the function \p FuncName, and the second element
-  // representing the basic block sections profile (cluster info) for this
-  // function. If the first element is true and the second element is empty, it
-  // means unique basic block sections are desired for all basic blocks of the
-  // function.
-  std::pair<bool, SmallVector<BBClusterInfo>>
+  // Returns the cluster info for the function \p FuncName. Returns an empty
+  // vector if function has no cluster info.
+  SmallVector<BBClusterInfo>
   getClusterInfoForFunction(StringRef FuncName) const;
 
   // Returns the path clonings for the given function.
@@ -190,7 +186,7 @@ class BasicBlockSectionsProfileReaderWrapperPass : public ImmutablePass {
 
   bool isFunctionHot(StringRef FuncName) const;
 
-  std::pair<bool, SmallVector<BBClusterInfo>>
+  SmallVector<BBClusterInfo>
   getClusterInfoForFunction(StringRef FuncName) const;
 
   SmallVector<SmallVector<unsigned>>
diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp
index e317e1c06741f..52e2909bec072 100644
--- a/llvm/lib/CodeGen/BasicBlockSections.cpp
+++ b/llvm/lib/CodeGen/BasicBlockSections.cpp
@@ -183,8 +183,7 @@ updateBranches(MachineFunction &MF,
 // clusters are ordered in increasing order of their IDs, with the "Exception"
 // and "Cold" succeeding all other clusters.
 // FuncClusterInfo represents the cluster information for basic blocks. It
-// maps from BBID of basic blocks to their cluster information. If this is
-// empty, it means unique sections for all basic blocks in the function.
+// maps from BBID of basic blocks to their cluster information.
 static void
 assignSections(MachineFunction &MF,
                const DenseMap<UniqueBBID, BBClusterInfo> &FuncClusterInfo) {
@@ -197,10 +196,8 @@ assignSections(MachineFunction &MF,
   for (auto &MBB : MF) {
     // With the 'all' option, every basic block is placed in a unique section.
     // With the 'list' option, every basic block is placed in a section
-    // associated with its cluster, unless we want individual unique sections
-    // for every basic block in this function (if FuncClusterInfo is empty).
-    if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All ||
-        FuncClusterInfo.empty()) {
+    // associated with its cluster.
+    if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) {
       // If unique sections are desired for all basic blocks of the function, we
       // set every basic block's section ID equal to its original position in
       // the layout (which is equal to its number). This ensures that basic
@@ -308,22 +305,22 @@ bool BasicBlockSections::handleBBSections(MachineFunction &MF) {
   if (BBSectionsType == BasicBlockSection::List &&
       hasInstrProfHashMismatch(MF))
     return false;
-  // Renumber blocks before sorting them. This is useful for accessing the
-  // original layout positions and finding the original fallthroughs.
-  MF.RenumberBlocks();
 
   DenseMap<UniqueBBID, BBClusterInfo> FuncClusterInfo;
   if (BBSectionsType == BasicBlockSection::List) {
-    auto [HasProfile, ClusterInfo] =
-        getAnalysis<BasicBlockSectionsProfileReaderWrapperPass>()
-            .getClusterInfoForFunction(MF.getName());
-    if (!HasProfile)
+    auto ClusterInfo = getAnalysis<BasicBlockSectionsProfileReaderWrapperPass>()
+                           .getClusterInfoForFunction(MF.getName());
+    if (ClusterInfo.empty())
       return false;
     for (auto &BBClusterInfo : ClusterInfo) {
       FuncClusterInfo.try_emplace(BBClusterInfo.BBID, BBClusterInfo);
     }
   }
 
+  // Renumber blocks before sorting them. This is useful for accessing the
+  // original layout positions and finding the original fallthroughs.
+  MF.RenumberBlocks();
+
   MF.setBBSectionsType(BBSectionsType);
   assignSections(MF, FuncClusterInfo);
 
diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
index 485b44ae4c4aa..c234c0f1b0b34 100644
--- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
+++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
@@ -58,22 +58,24 @@ BasicBlockSectionsProfileReader::parseUniqueBBID(StringRef S) const {
 }
 
 bool BasicBlockSectionsProfileReader::isFunctionHot(StringRef FuncName) const {
-  return getClusterInfoForFunction(FuncName).first;
+  return !getClusterInfoForFunction(FuncName).empty();
 }
 
-std::pair<bool, SmallVector<BBClusterInfo>>
+SmallVector<BBClusterInfo>
 BasicBlockSectionsProfileReader::getClusterInfoForFunction(
     StringRef FuncName) const {
   auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName));
-  return R != ProgramPathAndClusterInfo.end()
-             ? std::pair(true, R->second.ClusterInfo)
-             : std::pair(false, SmallVector<BBClusterInfo>());
+  return R != ProgramPathAndClusterInfo.end() ? R->second.ClusterInfo
+                                              : SmallVector<BBClusterInfo>();
 }
 
 SmallVector<SmallVector<unsigned>>
 BasicBlockSectionsProfileReader::getClonePathsForFunction(
     StringRef FuncName) const {
-  return ProgramPathAndClusterInfo.lookup(getAliasName(FuncName)).ClonePaths;
+  auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName));
+  return R != ProgramPathAndClusterInfo.end()
+             ? R->second.ClonePaths
+             : SmallVector<SmallVector<unsigned>>();
 }
 
 uint64_t BasicBlockSectionsProfileReader::getEdgeCount(
@@ -494,7 +496,7 @@ bool BasicBlockSectionsProfileReaderWrapperPass::isFunctionHot(
   return BBSPR.isFunctionHot(FuncName);
 }
 
-std::pair<bool, SmallVector<BBClusterInfo>>
+SmallVector<BBClusterInfo>
 BasicBlockSectionsProfileReaderWrapperPass::getClusterInfoForFunction(
     StringRef FuncName) const {
   return BBSPR.getClusterInfoForFunction(FuncName);
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-list.ll b/llvm/test/CodeGen/X86/basic-block-sections-list.ll
index 45ef452f4f5c1..d652a540f3e9c 100644
--- a/llvm/test/CodeGen/X86/basic-block-sections-list.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-list.ll
@@ -1,17 +1,13 @@
-;; Check the basic block sections list option.
-;; version 0 profile:
-; RUN: echo '!_Z3foob' > %t1
+;; Check that specifying the function in the basic block sections profile
+;; without any other directives is a noop.
 ;;
-;; version 1 profile:
-; RUN: echo 'v1' > %t2
-; RUN: echo 'f _Z3foob' >> %t2
+;; Specify the bb sections profile:
+; RUN: echo 'v1' > %t
+; RUN: echo 'f _Z3foob' >> %t
 ;;
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t  > %bbsections
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections > %orig
+; RUN: diff -u %orig %bbsections
 
 define i32 @_Z3foob(i1 zeroext %0) nounwind {
   %2 = alloca i32, align 4
@@ -41,45 +37,3 @@ define i32 @_Z3foob(i1 zeroext %0) nounwind {
 
 declare i32 @_Z3barv() #1
 declare i32 @_Z3bazv() #1
-
-define i32 @_Z3zipb(i1 zeroext %0) nounwind {
-  %2 = alloca i32, align 4
-  %3 = alloca i8, align 1
-  %4 = zext i1 %0 to i8
-  store i8 %4, ptr %3, align 1
-  %5 = load i8, ptr %3, align 1
-  %6 = trunc i8 %5 to i1
-  %7 = zext i1 %6 to i32
-  %8 = icmp sgt i32 %7, 0
-  br i1 %8, label %9, label %11
-
-9:                                                ; preds = %1
-  %10 = call i32 @_Z3barv()
-  store i32 %10, ptr %2, align 4
-  br label %13
-
-11:                                               ; preds = %1
-  %12 = call i32 @_Z3bazv()
-  store i32 %12, ptr %2, align 4
-  br label %13
-
-13:                                               ; preds = %11, %9
-  %14 = load i32, ptr %2, align 4
-  ret i32 %14
-}
-
-; LINUX-SECTIONS-NO-GUIDED-PREFIX: .section        .text._Z3foob,"ax",@progbits
-; LINUX-SECTIONS: .section        .text.hot._Z3foob,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob:
-; LINUX-SECTIONS: .section        .text.hot._Z3foob._Z3foob.__part.1,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob.__part.1:
-; LINUX-SECTIONS: .section        .text.hot._Z3foob._Z3foob.__part.2,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob.__part.2:
-; LINUX-SECTIONS: .section        .text.hot._Z3foob._Z3foob.__part.3,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob.__part.3:
-
-; LINUX-SECTIONS-FUNCTION-SECTION: .section     .text._Z3zipb,"ax",@progbits
-; LINUX-SECTIONS-NO-FUNCTION-SECTION-NOT: .section     .text{{.*}}._Z3zipb,"ax",@progbits
-; LINUX-SECTIONS: _Z3zipb:
-; LINUX-SECTIONS-NOT: .section        .text{{.*}}._Z3zipb.__part.{{[0-9]+}},"ax",@progbits
-; LINUX-SECTIONS-NOT: _Z3zipb.__part.{{[0-9]+}}:
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll
index d481b147662dc..6e0db20ca0492 100644
--- a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll
@@ -1,6 +1,8 @@
-; RUN: echo "!foo" > %t.order.txt
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt  | FileCheck --check-prefix=SOURCE-DRIFT %s
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s
+; RUN: echo "v1" > %t
+; RUN: echo "f foo" >> %t
+; RUN: echo "c 0" >> %t
+; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t  | FileCheck --check-prefix=SOURCE-DRIFT %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s
 
 define dso_local i32 @foo(i1 zeroext %0, i1 zeroext %1)  !annotation !1 {
   br i1 %0, label %5, label %3

From d5125b3089ddb4e704752656c011d13b4eab0a2c Mon Sep 17 00:00:00 2001
From: Zhen Wang <37195552+wangzpgi@users.noreply.github.com>
Date: Mon, 10 Nov 2025 14:28:32 -0800
Subject: [PATCH 22/97] [flang][CUDA] Unify element size computation in CUF
 helpers (#167398)

Refactor computeWidth from CUFOpConversion into a shared helper function
computeElementByteSize in CUFCommon.
---
 .../flang/Optimizer/Builder/CUFCommon.h       |  5 ++++
 flang/lib/Optimizer/Builder/CUFCommon.cpp     | 23 +++++++++++++++
 .../Optimizer/Transforms/CUFOpConversion.cpp  | 28 ++-----------------
 3 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h
index 5c56dd6b695f8..6e2442745f9a0 100644
--- a/flang/include/flang/Optimizer/Builder/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -18,6 +18,7 @@ static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem";
 
 namespace fir {
 class FirOpBuilder;
+class KindMapping;
 } // namespace fir
 
 namespace cuf {
@@ -34,6 +35,10 @@ bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);
 
 void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder);
 
+int computeElementByteSize(mlir::Location loc, mlir::Type type,
+                           fir::KindMapping &kindMap,
+                           bool emitErrorOnFailure = true);
+
 } // namespace cuf
 
 #endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_
diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp
index cf7588f275d22..461deb8e4cb55 100644
--- a/flang/lib/Optimizer/Builder/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp
@@ -9,6 +9,7 @@
 #include "flang/Optimizer/Builder/CUFCommon.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Optimizer/Dialect/Support/KindMapping.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
@@ -91,3 +92,25 @@ void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) {
     }
   }
 }
+
+int cuf::computeElementByteSize(mlir::Location loc, mlir::Type type,
+                                fir::KindMapping &kindMap,
+                                bool emitErrorOnFailure) {
+  auto eleTy = fir::unwrapSequenceType(type);
+  if (auto t{mlir::dyn_cast<mlir::IntegerType>(eleTy)})
+    return t.getWidth() / 8;
+  if (auto t{mlir::dyn_cast<mlir::FloatType>(eleTy)})
+    return t.getWidth() / 8;
+  if (auto t{mlir::dyn_cast<fir::LogicalType>(eleTy)})
+    return kindMap.getLogicalBitsize(t.getFKind()) / 8;
+  if (auto t{mlir::dyn_cast<mlir::ComplexType>(eleTy)}) {
+    int elemSize =
+        mlir::cast<mlir::FloatType>(t.getElementType()).getWidth() / 8;
+    return 2 * elemSize;
+  }
+  if (auto t{mlir::dyn_cast<fir::CharacterType>(eleTy)})
+    return kindMap.getCharacterBitsize(t.getFKind()) / 8;
+  if (emitErrorOnFailure)
+    mlir::emitError(loc, "unsupported type");
+  return 0;
+}
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 8d00272b09f42..5b1b0a2f6feab 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -263,28 +263,6 @@ static bool inDeviceContext(mlir::Operation *op) {
   return false;
 }
 
-static int computeWidth(mlir::Location loc, mlir::Type type,
-                        fir::KindMapping &kindMap) {
-  auto eleTy = fir::unwrapSequenceType(type);
-  if (auto t{mlir::dyn_cast<mlir::IntegerType>(eleTy)})
-    return t.getWidth() / 8;
-  if (auto t{mlir::dyn_cast<mlir::FloatType>(eleTy)})
-    return t.getWidth() / 8;
-  if (eleTy.isInteger(1))
-    return 1;
-  if (auto t{mlir::dyn_cast<fir::LogicalType>(eleTy)})
-    return kindMap.getLogicalBitsize(t.getFKind()) / 8;
-  if (auto t{mlir::dyn_cast<mlir::ComplexType>(eleTy)}) {
-    int elemSize =
-        mlir::cast<mlir::FloatType>(t.getElementType()).getWidth() / 8;
-    return 2 * elemSize;
-  }
-  if (auto t{mlir::dyn_cast_or_null<fir::CharacterType>(eleTy)})
-    return kindMap.getCharacterBitsize(t.getFKind()) / 8;
-  mlir::emitError(loc, "unsupported type");
-  return 0;
-}
-
 struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> {
   using OpRewritePattern::OpRewritePattern;
 
@@ -320,7 +298,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> {
       mlir::Value bytes;
       fir::KindMapping kindMap{fir::getKindMapping(mod)};
       if (fir::isa_trivial(op.getInType())) {
-        int width = computeWidth(loc, op.getInType(), kindMap);
+        int width = cuf::computeElementByteSize(loc, op.getInType(), kindMap);
         bytes =
             builder.createIntegerConstant(loc, builder.getIndexType(), width);
       } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(
@@ -330,7 +308,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> {
           mlir::Type structTy = typeConverter->convertType(seqTy.getEleTy());
           size = dl->getTypeSizeInBits(structTy) / 8;
         } else {
-          size = computeWidth(loc, seqTy.getEleTy(), kindMap);
+          size = cuf::computeElementByteSize(loc, seqTy.getEleTy(), kindMap);
         }
         mlir::Value width =
             builder.createIntegerConstant(loc, builder.getIndexType(), size);
@@ -704,7 +682,7 @@ struct CUFDataTransferOpConversion
             typeConverter->convertType(fir::unwrapSequenceType(dstTy));
         width = dl->getTypeSizeInBits(structTy) / 8;
       } else {
-        width = computeWidth(loc, dstTy, kindMap);
+        width = cuf::computeElementByteSize(loc, dstTy, kindMap);
       }
       mlir::Value widthValue = mlir::arith::ConstantOp::create(
           rewriter, loc, i64Ty, rewriter.getIntegerAttr(i64Ty, width));

From 7b12a08f5e0d5e615dee2b62f9a68a03e68b6c93 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234@gmail.com>
Date: Mon, 10 Nov 2025 17:32:31 -0500
Subject: [PATCH 23/97] [AArch64] Allow peephole to optimize AND + signed
 compare with 0 (#153608)

This should be the peephole's job. Because and sets V flag to 0, this is
why signed comparisons with 0 are okay to replace with tst. Note this is
only for AArch64, because ANDS on ARM leaves the V flag the same.

Fixes: https://github.com/llvm/llvm-project/issues/154387
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  48 ++-
 .../AArch64/arm64-regress-opt-cmp-signed.mir  |  55 +++
 llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll | 332 ++++++++++++++++++
 3 files changed, 434 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 55a04cca4c394..5e95807a68199 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1780,6 +1780,16 @@ static unsigned sForm(MachineInstr &Instr) {
   case AArch64::SUBSWri:
   case AArch64::SUBSXrr:
   case AArch64::SUBSXri:
+  case AArch64::ANDSWri:
+  case AArch64::ANDSWrr:
+  case AArch64::ANDSWrs:
+  case AArch64::ANDSXri:
+  case AArch64::ANDSXrr:
+  case AArch64::ANDSXrs:
+  case AArch64::BICSWrr:
+  case AArch64::BICSXrr:
+  case AArch64::BICSWrs:
+  case AArch64::BICSXrs:
     return Instr.getOpcode();
 
   case AArch64::ADDWrr:
@@ -1810,6 +1820,22 @@ static unsigned sForm(MachineInstr &Instr) {
     return AArch64::ANDSWri;
   case AArch64::ANDXri:
     return AArch64::ANDSXri;
+  case AArch64::ANDWrr:
+    return AArch64::ANDSWrr;
+  case AArch64::ANDWrs:
+    return AArch64::ANDSWrs;
+  case AArch64::ANDXrr:
+    return AArch64::ANDSXrr;
+  case AArch64::ANDXrs:
+    return AArch64::ANDSXrs;
+  case AArch64::BICWrr:
+    return AArch64::BICSWrr;
+  case AArch64::BICXrr:
+    return AArch64::BICSXrr;
+  case AArch64::BICWrs:
+    return AArch64::BICSWrs;
+  case AArch64::BICXrs:
+    return AArch64::BICSXrs;
   }
 }
 
@@ -1947,6 +1973,25 @@ static bool isSUBSRegImm(unsigned Opcode) {
   return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
 }
 
+static bool isANDOpcode(MachineInstr &MI) {
+  unsigned Opc = sForm(MI);
+  switch (Opc) {
+  case AArch64::ANDSWri:
+  case AArch64::ANDSWrr:
+  case AArch64::ANDSWrs:
+  case AArch64::ANDSXri:
+  case AArch64::ANDSXrr:
+  case AArch64::ANDSXrs:
+  case AArch64::BICSWrr:
+  case AArch64::BICSXrr:
+  case AArch64::BICSWrs:
+  case AArch64::BICSXrs:
+    return true;
+  default:
+    return false;
+  }
+}
+
 /// Check if CmpInstr can be substituted by MI.
 ///
 /// CmpInstr can be substituted:
@@ -1984,7 +2029,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
   // 1) MI and CmpInstr set N and V to the same value.
   // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
   //    signed overflow occurs, so CmpInstr could still be simplified away.
-  if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
+  // Note that Ands and Bics instructions always clear the V flag.
+  if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
     return false;
 
   AccessKind AccessToCheck = AK_Write;
diff --git a/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir
new file mode 100644
index 0000000000000..8c31e7c2d1cec
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir
@@ -0,0 +1,55 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s | FileCheck %s
+--- |
+  define i32 @test01() nounwind {
+  entry:
+    %0 = select i1 true, i32 1, i32 0
+    %1 = and i32 %0, 65535
+    %2 = icmp sgt i32 %1, 0
+    br i1 %2, label %if.then, label %if.end
+
+  if.then:                                      ; preds = %entry
+    ret i32 1
+
+  if.end:                                       ; preds = %entry
+    ret i32 0
+  }
+...
+---
+name:            test01
+registers:
+  - { id: 0, class: gpr32 }
+  - { id: 1, class: gpr32common }
+body:             |
+  ; CHECK-LABEL: name: test01
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 1
+  ; CHECK-NEXT:   [[ANDSWri:%[0-9]+]]:gpr32common = ANDSWri killed [[ANDSWri]], 15, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 12, %bb.2, implicit $nzcv
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.if.then:
+  ; CHECK-NEXT:   $w0 = MOVi32imm 1
+  ; CHECK-NEXT:   RET_ReallyLR implicit $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.if.end:
+  ; CHECK-NEXT:   $w0 = MOVi32imm 0
+  ; CHECK-NEXT:   RET_ReallyLR implicit $w0
+  bb.0.entry:
+    successors: %bb.2.if.end, %bb.1.if.then
+
+    %0 = MOVi32imm 1
+    %1 = ANDWri killed %1, 15
+    $wzr = SUBSWri killed %1, 0, 0, implicit-def $nzcv
+    Bcc 12, %bb.2.if.end, implicit $nzcv
+
+  bb.1.if.then:
+    $w0 = MOVi32imm 1
+    RET_ReallyLR implicit $w0
+
+  bb.2.if.end:
+    $w0 = MOVi32imm 0
+    RET_ReallyLR implicit $w0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll
index 33c5ba7987974..8297fa2d4e3f9 100644
--- a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll
+++ b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll
@@ -161,6 +161,338 @@ define i1 @lt64_u16_and_23(i64 %0) {
   ret i1 %3
 }
 
+define i1 @test_disjoint(i1 %0, i32 %1, i32 %2) {
+; CHECK-LABEL: test_disjoint:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr w9, w2, #0x800000
+; CHECK-NEXT:    lsl w8, w8, w1
+; CHECK-NEXT:    tst w9, w8
+; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i32 %2, 8388608
+  %4 = shl nuw i32 1, %1
+  %5 = and i32 %3, %4
+  %6 = icmp eq i32 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint2(i1 %0, i32 %1, i32 %2) {
+; CHECK-LABEL: test_disjoint2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr w9, w2, #0x800000
+; CHECK-NEXT:    lsl w8, w8, w1
+; CHECK-NEXT:    tst w9, w8
+; CHECK-NEXT:    cset w8, gt
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i32 %2, 8388608
+  %4 = shl nuw i32 1, %1
+  %5 = and i32 %3, %4
+  %6 = icmp sgt i32 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint3(i1 %0, i32 %1, i32 %2) {
+; CHECK-LABEL: test_disjoint3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr w9, w2, #0x800000
+; CHECK-NEXT:    lsl w8, w8, w1
+; CHECK-NEXT:    tst w9, w8
+; CHECK-NEXT:    cset w8, mi
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i32 %2, 8388608
+  %4 = shl nuw i32 1, %1
+  %5 = and i32 %3, %4
+  %6 = icmp slt i32 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint4(i1 %0, i32 %1, i32 %2) {
+; CHECK-LABEL: test_disjoint4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr w9, w2, #0x800000
+; CHECK-NEXT:    lsl w8, w8, w1
+; CHECK-NEXT:    and w8, w9, w8
+; CHECK-NEXT:    cmp w8, #1
+; CHECK-NEXT:    cset w8, lt
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i32 %2, 8388608
+  %4 = shl nuw i32 1, %1
+  %5 = and i32 %3, %4
+  %6 = icmp sle i32 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint_inverse_4(i1 %0, i32 %1, i32 %2) {
+; CHECK-LABEL: test_disjoint_inverse_4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr w9, w2, #0x800000
+; CHECK-NEXT:    lsl w8, w8, w1
+; CHECK-NEXT:    bic w8, w9, w8
+; CHECK-NEXT:    cmp w8, #1
+; CHECK-NEXT:    cset w8, lt
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i32 %2, 8388608
+  %4 = shl nuw i32 1, %1
+  %not = xor i32 %4, -1
+  %5 = and i32 %3, %not
+  %6 = icmp sle i32 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint_inverse(i1 %0, i32 %1, i32 %2) {
+; CHECK-LABEL: test_disjoint_inverse:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr w9, w2, #0x800000
+; CHECK-NEXT:    lsl w8, w8, w1
+; CHECK-NEXT:    bics wzr, w9, w8
+; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i32 %2, 8388608
+  %4 = shl nuw i32 1, %1
+  %not = xor i32 %4, -1
+  %5 = and i32 %3, %not
+  %6 = icmp eq i32 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint2_inverse(i1 %0, i32 %1, i32 %2) {
+; CHECK-LABEL: test_disjoint2_inverse:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr w9, w2, #0x800000
+; CHECK-NEXT:    lsl w8, w8, w1
+; CHECK-NEXT:    bics wzr, w9, w8
+; CHECK-NEXT:    cset w8, gt
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i32 %2, 8388608
+  %4 = shl nuw i32 1, %1
+  %not = xor i32 %4, -1
+  %5 = and i32 %3, %not
+  %6 = icmp sgt i32 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint3_inverse(i1 %0, i32 %1, i32 %2) {
+; CHECK-LABEL: test_disjoint3_inverse:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr w9, w2, #0x800000
+; CHECK-NEXT:    lsl w8, w8, w1
+; CHECK-NEXT:    bics wzr, w9, w8
+; CHECK-NEXT:    cset w8, mi
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i32 %2, 8388608
+  %4 = shl nuw i32 1, %1
+  %not = xor i32 %4, -1
+  %5 = and i32 %3, %not
+  %6 = icmp slt i32 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint_64(i1 %0, i64 %1, i64 %2) {
+; CHECK-LABEL: test_disjoint_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr x9, x2, #0x80000000000000
+; CHECK-NEXT:    lsl x8, x8, x1
+; CHECK-NEXT:    tst x9, x8
+; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i64 %2, 36028797018963968
+  %4 = shl nuw i64 1, %1
+  %5 = and i64 %3, %4
+  %6 = icmp eq i64 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint2_64(i1 %0, i64 %1, i64 %2) {
+; CHECK-LABEL: test_disjoint2_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr x9, x2, #0x80000000000000
+; CHECK-NEXT:    lsl x8, x8, x1
+; CHECK-NEXT:    tst x9, x8
+; CHECK-NEXT:    cset w8, gt
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i64 %2, 36028797018963968
+  %4 = shl nuw i64 1, %1
+  %5 = and i64 %3, %4
+  %6 = icmp sgt i64 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint3_64(i1 %0, i64 %1, i64 %2) {
+; CHECK-LABEL: test_disjoint3_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr x9, x2, #0x80000000000000
+; CHECK-NEXT:    lsl x8, x8, x1
+; CHECK-NEXT:    tst x9, x8
+; CHECK-NEXT:    cset w8, mi
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i64 %2, 36028797018963968
+  %4 = shl nuw i64 1, %1
+  %5 = and i64 %3, %4
+  %6 = icmp slt i64 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint4_64(i1 %0, i64 %1, i64 %2) {
+; CHECK-LABEL: test_disjoint4_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr x9, x2, #0x80000000000000
+; CHECK-NEXT:    lsl x8, x8, x1
+; CHECK-NEXT:    and x8, x9, x8
+; CHECK-NEXT:    cmp x8, #1
+; CHECK-NEXT:    cset w8, lt
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i64 %2, 36028797018963968
+  %4 = shl nuw i64 1, %1
+  %5 = and i64 %3, %4
+  %6 = icmp sle i64 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint_inverse_4_64(i1 %0, i64 %1, i64 %2) {
+; CHECK-LABEL: test_disjoint_inverse_4_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr x9, x2, #0x80000000000000
+; CHECK-NEXT:    lsl x8, x8, x1
+; CHECK-NEXT:    bic x8, x9, x8
+; CHECK-NEXT:    cmp x8, #1
+; CHECK-NEXT:    cset w8, lt
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i64 %2, 36028797018963968
+  %4 = shl nuw i64 1, %1
+  %not = xor i64 %4, -1
+  %5 = and i64 %3, %not
+  %6 = icmp sle i64 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint_inverse_64(i1 %0, i64 %1, i64 %2) {
+; CHECK-LABEL: test_disjoint_inverse_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr x9, x2, #0x80000000000000
+; CHECK-NEXT:    lsl x8, x8, x1
+; CHECK-NEXT:    bics xzr, x9, x8
+; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i64 %2, 36028797018963968
+  %4 = shl nuw i64 1, %1
+  %not = xor i64 %4, -1
+  %5 = and i64 %3, %not
+  %6 = icmp eq i64 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint2_inverse_64(i1 %0, i64 %1, i64 %2) {
+; CHECK-LABEL: test_disjoint2_inverse_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr x9, x2, #0x80000000000000
+; CHECK-NEXT:    lsl x8, x8, x1
+; CHECK-NEXT:    bics xzr, x9, x8
+; CHECK-NEXT:    cset w8, gt
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i64 %2, 36028797018963968
+  %4 = shl nuw i64 1, %1
+  %not = xor i64 %4, -1
+  %5 = and i64 %3, %not
+  %6 = icmp sgt i64 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
+define i1 @test_disjoint3_inverse_64(i1 %0, i64 %1, i64 %2) {
+; CHECK-LABEL: test_disjoint3_inverse_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    orr x9, x2, #0x80000000000000
+; CHECK-NEXT:    lsl x8, x8, x1
+; CHECK-NEXT:    bics xzr, x9, x8
+; CHECK-NEXT:    cset w8, mi
+; CHECK-NEXT:    orr w8, w0, w8
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+entry:
+  %3 = or disjoint i64 %2, 36028797018963968
+  %4 = shl nuw i64 1, %1
+  %not = xor i64 %4, -1
+  %5 = and i64 %3, %not
+  %6 = icmp slt i64 %5, 0
+  %7 = select i1 %0, i1 true, i1 %6
+  ret i1 %7
+}
+
 ; negative test
 define i1 @lt3_u8(i8 %0) {
 ; CHECK-LABEL: lt3_u8:

From bf3b704c60cc521b79ec54bd57fcf72368178a52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Susan=20Tan=20=28=E3=82=B9-=E3=82=B6=E3=83=B3=E3=80=80?=
 =?UTF-8?q?=E3=82=BF=E3=83=B3=29?= <zujunt@nvidia.com>
Date: Mon, 10 Nov 2025 17:33:43 -0500
Subject: [PATCH 24/97] [flang][NFC] Characterize allocation based on MemAlloc
 effect instead of pattern matching (#166806)

Flang alias analysis used to find allocation site by pattern matching
allocation ops in mainly FIR dialect. This MR extends the
characterization to instead characterize based on whether the result of
an op has MemAlloc effect.
---
 .../include/flang/Optimizer/Dialect/FIROps.td |  12 +-
 .../lib/Optimizer/Analysis/AliasAnalysis.cpp  |  57 ++++++--
 flang/test/Driver/tco-emit-final-mlir.fir     |   2 +
 flang/test/Fir/alloc.fir                      |   9 ++
 .../test/Fir/omp-reduction-embox-codegen.fir  |   2 +
 flang/test/Fir/pdt.fir                        |   6 +-
 flang/test/HLFIR/inline-hlfir-copy-in.fir     |   2 +-
 flang/test/Lower/Intrinsics/c_f_pointer.f90   |   1 -
 flang/test/Lower/Intrinsics/system_clock.f90  |   2 -
 flang/test/Lower/allocatables.f90             |   6 +-
 .../test/Lower/character-local-variables.f90  |  11 ++
 flang/test/Lower/derived-types.f90            |   2 +
 flang/test/Lower/do_loop_unstructured.f90     |   1 +
 flang/test/Lower/forall/array-pointer.f90     |   1 -
 .../test/Lower/forall/forall-allocatable.f90  |  17 ++-
 flang/test/Lower/loops.f90                    |   1 -
 flang/test/Lower/polymorphic.f90              |   4 -
 flang/test/Lower/statement-function.f90       |   1 -
 flang/test/Transforms/stack-arrays.fir        | 131 +++++++++++-------
 19 files changed, 172 insertions(+), 96 deletions(-)

diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index bae52d63fda45..289c79bd9b831 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -80,8 +80,7 @@ def AnyRefOfConstantSizeAggregateType : TypeConstraint<
 // Memory SSA operations
 //===----------------------------------------------------------------------===//
 
-def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments,
-    MemoryEffects<[MemAlloc<AutomaticAllocationScopeResource>]>]> {
+def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments]> {
   let summary = "allocate storage for a temporary on the stack given a type";
   let description = [{
     This primitive operation is used to allocate an object on the stack.  A
@@ -162,7 +161,9 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments,
     Variadic<AnyIntegerType>:$shape
   );
 
-  let results = (outs fir_ReferenceType);
+  let results =
+      (outs Res<fir_ReferenceType,
+                "", [MemAlloc<AutomaticAllocationScopeResource>]>:$res);
 
   let hasCustomAssemblyFormat = 1;
   let hasVerifier = 1;
@@ -212,8 +213,7 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments,
   }];
 }
 
-def fir_AllocMemOp : fir_Op<"allocmem",
-    [MemoryEffects<[MemAlloc<DefaultResource>]>, AttrSizedOperandSegments]> {
+def fir_AllocMemOp : fir_Op<"allocmem", [AttrSizedOperandSegments]> {
   let summary = "allocate storage on the heap for an object of a given type";
 
   let description = [{
@@ -235,7 +235,7 @@ def fir_AllocMemOp : fir_Op<"allocmem",
     Variadic<AnyIntegerType>:$typeparams,
     Variadic<AnyIntegerType>:$shape
   );
-  let results = (outs fir_HeapType);
+  let results = (outs Res<fir_HeapType, "", [MemAlloc<DefaultResource>]>:$res);
 
   let hasCustomAssemblyFormat = 1;
   let hasVerifier = 1;
diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
index 73ddd1ff80126..ef9894232b409 100644
--- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
+++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
@@ -27,6 +27,26 @@ using namespace mlir;
 
 #define DEBUG_TYPE "fir-alias-analysis"
 
+// Inspect for value-scoped Allocate effects and determine whether
+// 'candidate' is a new allocation. Returns SourceKind::Allocate if a
+// MemAlloc effect is attached
+static fir::AliasAnalysis::SourceKind
+classifyAllocateFromEffects(mlir::Operation *op, mlir::Value candidate) {
+  if (!op)
+    return fir::AliasAnalysis::SourceKind::Unknown;
+  auto interface = llvm::dyn_cast<mlir::MemoryEffectOpInterface>(op);
+  if (!interface)
+    return fir::AliasAnalysis::SourceKind::Unknown;
+  llvm::SmallVector<mlir::MemoryEffects::EffectInstance, 4> effects;
+  interface.getEffects(effects);
+  for (mlir::MemoryEffects::EffectInstance &e : effects) {
+    if (mlir::isa<mlir::MemoryEffects::Allocate>(e.getEffect()) &&
+        e.getValue() && e.getValue() == candidate)
+      return fir::AliasAnalysis::SourceKind::Allocate;
+  }
+  return fir::AliasAnalysis::SourceKind::Unknown;
+}
+
 //===----------------------------------------------------------------------===//
 // AliasAnalysis: alias
 //===----------------------------------------------------------------------===//
@@ -535,6 +555,11 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
   mlir::Operation *instantiationPoint{nullptr};
   while (defOp && !breakFromLoop) {
     ty = defOp->getResultTypes()[0];
+    // Value-scoped allocation detection via effects.
+    if (classifyAllocateFromEffects(defOp, v) == SourceKind::Allocate) {
+      type = SourceKind::Allocate;
+      break;
+    }
     llvm::TypeSwitch<Operation *>(defOp)
         .Case<hlfir::AsExprOp>([&](auto op) {
           v = op.getVar();
@@ -554,11 +579,6 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
             defOp = v.getDefiningOp();
           }
         })
-        .Case<fir::AllocaOp, fir::AllocMemOp>([&](auto op) {
-          // Unique memory allocation.
-          type = SourceKind::Allocate;
-          breakFromLoop = true;
-        })
         .Case<fir::ConvertOp>([&](auto op) {
           // Skip ConvertOp's and track further through the operand.
           v = op->getOperand(0);
@@ -628,16 +648,23 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
               type = SourceKind::Global;
             } else {
               auto def = llvm::cast<mlir::Value>(boxSrc.origin.u);
-              // TODO: Add support to fir.allocmem
-              if (auto allocOp = def.template getDefiningOp<fir::AllocaOp>()) {
-                v = def;
-                defOp = v.getDefiningOp();
-                type = SourceKind::Allocate;
-              } else if (isDummyArgument(def)) {
-                defOp = nullptr;
-                v = def;
-              } else {
-                type = SourceKind::Indirect;
+              bool classified = false;
+              if (auto defDefOp = def.getDefiningOp()) {
+                if (classifyAllocateFromEffects(defDefOp, def) ==
+                    SourceKind::Allocate) {
+                  v = def;
+                  defOp = defDefOp;
+                  type = SourceKind::Allocate;
+                  classified = true;
+                }
+              }
+              if (!classified) {
+                if (isDummyArgument(def)) {
+                  defOp = nullptr;
+                  v = def;
+                } else {
+                  type = SourceKind::Indirect;
+                }
               }
             }
             breakFromLoop = true;
diff --git a/flang/test/Driver/tco-emit-final-mlir.fir b/flang/test/Driver/tco-emit-final-mlir.fir
index 75f8f153127af..7e934c921e773 100644
--- a/flang/test/Driver/tco-emit-final-mlir.fir
+++ b/flang/test/Driver/tco-emit-final-mlir.fir
@@ -15,5 +15,7 @@
 
 func.func @_QPfoo() {
   %1 = fir.alloca i32
+  %0 = arith.constant 0 : i32
+  fir.store %0 to %1 : !fir.ref<i32>
   return
 }
diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir
index 8da8b828c18b9..613c8e274baad 100644
--- a/flang/test/Fir/alloc.fir
+++ b/flang/test/Fir/alloc.fir
@@ -372,8 +372,17 @@ func.func @alloca_unlimited_polymorphic_box() {
   %1 = fir.alloca !fir.class<!fir.array<?xnone>>
   %2 = fir.alloca !fir.box<none>
   %3 = fir.alloca !fir.box<!fir.array<?xnone>>
+  // Add real uses so allocas are not trivially dead.
+  fir.call @__use_class_none(%0) : (!fir.ref<!fir.class<none>>) -> ()
+  fir.call @__use_class_array(%1) : (!fir.ref<!fir.class<!fir.array<?xnone>>>) -> ()
+  fir.call @__use_box_none(%2) : (!fir.ref<!fir.box<none>>) -> ()
+  fir.call @__use_box_array(%3) : (!fir.ref<!fir.box<!fir.array<?xnone>>>) -> ()
   return
 }
+func.func private @__use_class_none(!fir.ref<!fir.class<none>>) -> ()
+func.func private @__use_class_array(!fir.ref<!fir.class<!fir.array<?xnone>>>) -> ()
+func.func private @__use_box_none(!fir.ref<!fir.box<none>>) -> ()
+func.func private @__use_box_array(!fir.ref<!fir.box<!fir.array<?xnone>>>) -> ()
 // Note: allocmem of fir.box are not possible (fir::HeapType::verify does not
 // accept box types), so there is no equivalent of
 // alloca_unlimited_polymorphic_box for allocmem.
diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir
index 1645e1a407ad4..47fffb35a7d92 100644
--- a/flang/test/Fir/omp-reduction-embox-codegen.fir
+++ b/flang/test/Fir/omp-reduction-embox-codegen.fir
@@ -28,9 +28,11 @@ func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
   omp.parallel reduction(byref @test_reduction %4 -> %arg0 : !fir.ref<!fir.box<i32>>) {
     omp.terminator
   }
+  func.call @__use_box_i32(%4) : (!fir.ref<!fir.box<i32>>) -> ()
   return
 }
 
+func.func private @__use_box_i32(!fir.ref<!fir.box<i32>>) -> ()
 // basically we are testing that there isn't a crash
 // CHECK-LABEL: define void @_QQmain
 // CHECK-NEXT:    alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
diff --git a/flang/test/Fir/pdt.fir b/flang/test/Fir/pdt.fir
index a200cd7e7cc03..04f48e745d033 100644
--- a/flang/test/Fir/pdt.fir
+++ b/flang/test/Fir/pdt.fir
@@ -95,14 +95,14 @@ func.func @_QTt1P.f2.offset(%0 : i32, %1 : i32) -> i32 {
 // end program p
 
 func.func private @bar(!fir.ref<!fir.char<1,?>>)
+func.func private @__use_t1(!fir.ref<!fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>>) -> ()
 
 // CHECK-LABEL: define void @_QPfoo(i32 %0, i32 %1)
 func.func @_QPfoo(%arg0 : i32, %arg1 : i32) {
   // CHECK: %[[size:.*]] = call i64 @_QTt1P.mem.size(i32 %0, i32 %1)
   // CHECK: %[[alloc:.*]] = alloca i8, i64 %[[size]]
   %0 = fir.alloca !fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>(%arg0, %arg1 : i32, i32)
-  //%2 = fir.coordinate_of %0, f2 : (!fir.ref<!fir.type<_QTt1>>) -> !fir.ref<!fir.char<1,?>>
-  %2 = fir.zero_bits !fir.ref<!fir.char<1,?>>
-  fir.call @bar(%2) : (!fir.ref<!fir.char<1,?>>) -> ()
+  // Keep alloca live without creating an unsupported coordinate_of on dynamic-sized field.
+  func.call @__use_t1(%0) : (!fir.ref<!fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>>) -> ()
   return
 }
diff --git a/flang/test/HLFIR/inline-hlfir-copy-in.fir b/flang/test/HLFIR/inline-hlfir-copy-in.fir
index f3c4b38962a0c..f1da1da9f9a5c 100644
--- a/flang/test/HLFIR/inline-hlfir-copy-in.fir
+++ b/flang/test/HLFIR/inline-hlfir-copy-in.fir
@@ -75,7 +75,7 @@ func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?x?x?xf64>> {
 // CHECK:    %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>>
 // CHECK:    %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
 // CHECK:    fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>, !fir.ref<i32>) -> ()
-// CHECK:    hlfir.copy_out %16, %15#1 : (!fir.ref<!fir.box<!fir.array<?xf64>>>, i1) -> ()
+// CHECK:    hlfir.copy_out %{{.*}}, %[[VAL_21:.*]]#1 : (!fir.ref<!fir.box<!fir.array<?xf64>>>, i1) -> ()
 // CHECK:    hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref<i32>, i1
 // CHECK:    return
 // CHECK:  }
diff --git a/flang/test/Lower/Intrinsics/c_f_pointer.f90 b/flang/test/Lower/Intrinsics/c_f_pointer.f90
index c1f1d7972d4b1..f54fda42cf51b 100644
--- a/flang/test/Lower/Intrinsics/c_f_pointer.f90
+++ b/flang/test/Lower/Intrinsics/c_f_pointer.f90
@@ -153,7 +153,6 @@ subroutine dynamic_shape_lower(cptr, fpr, shape, lower)
 ! CHECK: %[[VAL_2:.*]] = fir.shape %[[C_0]], %[[C_0]] : (index, index) -> !fir.shape<2>
 ! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_1:.*]](%[[VAL_2]]) : (!fir.ptr<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>>
 ! CHECK: fir.store %[[VAL_3]] to %[[VAL_0:.*]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
-! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFdynamic_shape_lowerEn"}
 ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0:.*]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64>
 ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
 ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> !fir.ptr<!fir.array<?x?xf32>>
diff --git a/flang/test/Lower/Intrinsics/system_clock.f90 b/flang/test/Lower/Intrinsics/system_clock.f90
index 9eae3a58884fa..f6fae1113b315 100644
--- a/flang/test/Lower/Intrinsics/system_clock.f90
+++ b/flang/test/Lower/Intrinsics/system_clock.f90
@@ -32,11 +32,9 @@ subroutine system_clock_test()
 
 ! CHECK-LABEL: @_QPss
 subroutine ss(count)
-  ! CHECK:   %[[V_0:[0-9]+]] = fir.alloca !fir.box<!fir.heap<i64>> {bindc_name = "count_max", uniq_name = "_QFssEcount_max"}
   ! CHECK:   %[[V_1:[0-9]+]] = fir.alloca !fir.heap<i64> {uniq_name = "_QFssEcount_max.addr"}
   ! CHECK:   %[[V_2:[0-9]+]] = fir.zero_bits !fir.heap<i64>
   ! CHECK:   fir.store %[[V_2]] to %[[V_1]] : !fir.ref<!fir.heap<i64>>
-  ! CHECK:   %[[V_3:[0-9]+]] = fir.alloca !fir.box<!fir.ptr<i64>> {bindc_name = "count_rate", uniq_name = "_QFssEcount_rate"}
   ! CHECK:   %[[V_4:[0-9]+]] = fir.alloca !fir.ptr<i64> {uniq_name = "_QFssEcount_rate.addr"}
   ! CHECK:   %[[V_5:[0-9]+]] = fir.zero_bits !fir.ptr<i64>
   ! CHECK:   fir.store %[[V_5]] to %[[V_4]] : !fir.ref<!fir.ptr<i64>>
diff --git a/flang/test/Lower/allocatables.f90 b/flang/test/Lower/allocatables.f90
index e62f92fa0c1c7..60b7de3301c48 100644
--- a/flang/test/Lower/allocatables.f90
+++ b/flang/test/Lower/allocatables.f90
@@ -56,7 +56,7 @@ subroutine foodim1()
   ! CHECK-DAG: fir.load %[[xAddrVar]] : !fir.ref<!fir.heap<!fir.array<?xf32>>>
 
   deallocate(x)
-  ! CHECK: %[[xAddr1:.*]] = fir.load %1 : !fir.ref<!fir.heap<!fir.array<?xf32>>>
+  ! CHECK: %[[xAddr1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.heap<!fir.array<?xf32>>>
   ! CHECK: fir.freemem %[[xAddr1]]
   ! CHECK: %[[nullAddr1:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>>
   ! CHECK: fir.store %[[nullAddr1]] to %[[xAddrVar]] : !fir.ref<!fir.heap<!fir.array<?xf32>>>
@@ -67,10 +67,6 @@ subroutine foodim2()
   ! Test lowering of local allocatable specification
   real, allocatable :: x(:, :)
   ! CHECK-DAG: fir.alloca !fir.heap<!fir.array<?x?xf32>> {{{.*}}uniq_name = "_QFfoodim2Ex.addr"}
-  ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb0"}
-  ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext0"}
-  ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb1"}
-  ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext1"}
 end subroutine
 
 ! test lowering of character allocatables. Focus is placed on the length handling
diff --git a/flang/test/Lower/character-local-variables.f90 b/flang/test/Lower/character-local-variables.f90
index d5b959eca1ff6..6325229993a25 100644
--- a/flang/test/Lower/character-local-variables.f90
+++ b/flang/test/Lower/character-local-variables.f90
@@ -8,6 +8,7 @@
 subroutine scalar_cst_len()
   character(10) :: c
   ! CHECK: fir.alloca !fir.char<1,10> {{{.*}}uniq_name = "_QFscalar_cst_lenEc"}
+  print *, c
 end subroutine
 
 ! CHECK-LABEL: func @_QPscalar_dyn_len
@@ -19,12 +20,14 @@ subroutine scalar_dyn_len(l)
   ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32
   ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32
   ! CHECK: fir.alloca !fir.char<1,?>(%[[l]] : i32) {{{.*}}uniq_name = "_QFscalar_dyn_lenEc"}
+  print *, c
 end subroutine
 
 ! CHECK-LABEL: func @_QPcst_array_cst_len
 subroutine cst_array_cst_len()
   character(10) :: c(20)
   ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_lenEc"}
+  print *, c(1)
 end subroutine
 
 ! CHECK-LABEL: func @_QPcst_array_dyn_len
@@ -36,6 +39,7 @@ subroutine cst_array_dyn_len(l)
   ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32
   ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32
   ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i32) {{{.*}}uniq_name = "_QFcst_array_dyn_lenEc"}
+  print *, c(1)
 end subroutine
 
 ! CHECK-LABEL: func @_QPdyn_array_cst_len
@@ -48,6 +52,7 @@ subroutine dyn_array_cst_len(n)
   ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index
   ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index
   ! CHECK: fir.alloca !fir.array<?x!fir.char<1,10>>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_lenEc"}
+  print *, c(1)
 end subroutine
 
 ! CHECK: func @_QPdyn_array_dyn_len
@@ -63,12 +68,14 @@ subroutine dyn_array_dyn_len(l, n)
   ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index
   ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index
   ! CHECK: fir.alloca !fir.array<?x!fir.char<1,?>>(%[[l]] : i32), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_lenEc"}
+  print *, c(1)
 end subroutine
 
 ! CHECK-LABEL: func @_QPcst_array_cst_len_lb
 subroutine cst_array_cst_len_lb()
   character(10) :: c(11:30)
   ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_len_lbEc"}
+  print *, c(11)
 end subroutine
 
 ! CHECK-LABEL: func @_QPcst_array_dyn_len_lb
@@ -80,6 +87,7 @@ subroutine cst_array_dyn_len_lb(l)
   ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i64
   ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i64
   ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i64) {{{.*}}uniq_name = "_QFcst_array_dyn_len_lbEc"}
+  print *, c(11)
 end subroutine
 
 ! CHECK-LABEL: func @_QPdyn_array_cst_len_lb
@@ -94,6 +102,7 @@ subroutine dyn_array_cst_len_lb(n)
   ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index
   ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index
   ! CHECK: fir.alloca !fir.array<?x!fir.char<1,10>>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_len_lbEc"}
+  print *, c(11)
 end subroutine
 
 ! CHECK-LABEL: func @_QPdyn_array_dyn_len_lb
@@ -111,6 +120,7 @@ subroutine dyn_array_dyn_len_lb(l, n)
   ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index
   ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index
   ! CHECK: fir.alloca !fir.array<?x!fir.char<1,?>>(%[[l]] : i64), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_len_lbEc"}
+  print *, c(11)
 end subroutine
 
 ! Test that the length of assumed length parameter is correctly deduced in lowering.
@@ -129,4 +139,5 @@ subroutine assumed_length_param(n)
 subroutine scalar_cst_neg_len()
   character(-1) :: c
   ! CHECK: fir.alloca !fir.char<1,0> {{{.*}}uniq_name = "_QFscalar_cst_neg_lenEc"}
+  print *, c
 end subroutine
diff --git a/flang/test/Lower/derived-types.f90 b/flang/test/Lower/derived-types.f90
index 4d36a7632b070..7e36ec0cfe93f 100644
--- a/flang/test/Lower/derived-types.f90
+++ b/flang/test/Lower/derived-types.f90
@@ -35,6 +35,8 @@ subroutine local_derived()
   ! CHECK-DAG: fir.alloca !fir.type<_QMdTr{x:f32}>
   type(r) :: some_r
   type(c2) :: some_c2
+  print *, some_c2%ch_array(1,1)
+  print *, some_r%x
 end subroutine
 
 ! CHECK-LABEL: func @_QMdPsaved_derived(
diff --git a/flang/test/Lower/do_loop_unstructured.f90 b/flang/test/Lower/do_loop_unstructured.f90
index 3b03850b43bb2..9c7d874a1aac8 100644
--- a/flang/test/Lower/do_loop_unstructured.f90
+++ b/flang/test/Lower/do_loop_unstructured.f90
@@ -235,6 +235,7 @@ subroutine nested_structured_in_unstructured()
 subroutine unstructured_do_concurrent
   logical :: success
   do concurrent (i=1:10) local(success)
+    success = .false.
     error stop "fail"
   enddo
 end
diff --git a/flang/test/Lower/forall/array-pointer.f90 b/flang/test/Lower/forall/array-pointer.f90
index fd3efed736c39..6b8c5648af29e 100644
--- a/flang/test/Lower/forall/array-pointer.f90
+++ b/flang/test/Lower/forall/array-pointer.f90
@@ -318,7 +318,6 @@ end subroutine s2_3
 ! CHECK-LABEL: func @_QPs2_3(
 ! CHECK-SAME:                %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.type<_QMarray_of_pointer_testTt{ip:!fir.box<!fir.ptr<i32>>}>>> {fir.bindc_name = "x"}) {
 ! CHECK:         %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"}
-! CHECK:         %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "y", fir.target, uniq_name = "_QFs2_3Ey"}
 ! CHECK:         %[[VAL_3:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>> {uniq_name = "_QFs2_3Ey.addr"}
 ! CHECK:         %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.lb0"}
 ! CHECK:         %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.ext0"}
diff --git a/flang/test/Lower/forall/forall-allocatable.f90 b/flang/test/Lower/forall/forall-allocatable.f90
index 96cd37ea3ed8a..8e54d282aea4b 100644
--- a/flang/test/Lower/forall/forall-allocatable.f90
+++ b/flang/test/Lower/forall/forall-allocatable.f90
@@ -13,20 +13,19 @@ end subroutine forall_with_allocatable
 ! CHECK-LABEL: func @_QPforall_with_allocatable(
 ! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>{{.*}}) {
 ! CHECK:         %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"}
-! CHECK:         %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "arr", uniq_name = "_QFforall_with_allocatableEarr"}
-! CHECK:         %[[VAL_3:.*]] = fir.alloca !fir.heap<!fir.array<?xf32>> {uniq_name = "_QFforall_with_allocatableEarr.addr"}
-! CHECK:         %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"}
-! CHECK:         %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"}
-! CHECK:         %[[VAL_6:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>>
-! CHECK:         fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<!fir.heap<!fir.array<?xf32>>>
+! CHECK:         %[[VAL_2:.*]] = fir.alloca !fir.heap<!fir.array<?xf32>> {uniq_name = "_QFforall_with_allocatableEarr.addr"}
+! CHECK:         %[[VAL_3:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"}
+! CHECK:         %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"}
+! CHECK:         %[[VAL_5:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>>
+! CHECK:         fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<!fir.heap<!fir.array<?xf32>>>
 ! CHECK:         %[[VAL_7:.*]] = arith.constant 5 : i32
 ! CHECK:         %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index
 ! CHECK:         %[[VAL_9:.*]] = arith.constant 15 : i32
 ! CHECK:         %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index
 ! CHECK:         %[[VAL_11:.*]] = arith.constant 1 : index
-! CHECK:         %[[VAL_12:.*]] = fir.load %[[VAL_4]] : !fir.ref<index>
-! CHECK:         %[[VAL_13:.*]] = fir.load %[[VAL_5]] : !fir.ref<index>
-! CHECK:         %[[VAL_14:.*]] = fir.load %[[VAL_3]] : !fir.ref<!fir.heap<!fir.array<?xf32>>>
+! CHECK:         %[[VAL_12:.*]] = fir.load %[[VAL_3]] : !fir.ref<index>
+! CHECK:         %[[VAL_13:.*]] = fir.load %[[VAL_4]] : !fir.ref<index>
+! CHECK:         %[[VAL_14:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.heap<!fir.array<?xf32>>>
 ! CHECK:         %[[VAL_15:.*]] = fir.shape_shift %[[VAL_12]], %[[VAL_13]] : (index, index) -> !fir.shapeshift<1>
 ! CHECK:         %[[VAL_16:.*]] = fir.array_load %[[VAL_14]](%[[VAL_15]]) : (!fir.heap<!fir.array<?xf32>>, !fir.shapeshift<1>) -> !fir.array<?xf32>
 ! CHECK:         %[[VAL_17:.*]] = fir.array_load %[[VAL_0]] : (!fir.box<!fir.array<?xf32>>) -> !fir.array<?xf32>
diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index 2fea84b03891a..5ee6562733dae 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -90,7 +90,6 @@ subroutine lis(n)
   ! CHECK-DAG: fir.alloca !fir.array<?x?x?xi32>, %{{.*}}, %{{.*}}, %{{.*}} {bindc_name = "a", fir.target, uniq_name = "_QFlisEa"}
   ! CHECK-DAG: fir.alloca !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = "r", uniq_name = "_QFlisEr"}
   ! CHECK-DAG: fir.alloca !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = "s", uniq_name = "_QFlisEs"}
-  ! CHECK-DAG: fir.alloca !fir.array<?x?xi32>, %{{.*}}, %{{.*}} {bindc_name = "t", uniq_name = "_QFlisEt"}
   integer, target    :: a(n,n,n) ! operand via p
   integer            :: r(n,n)   ! result, unspecified locality
   integer            :: s(n,n)   ! shared locality
diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90
index f586380e653a0..bc4eed54282df 100644
--- a/flang/test/Lower/polymorphic.f90
+++ b/flang/test/Lower/polymorphic.f90
@@ -287,7 +287,6 @@ subroutine pointer_assign_parent(p)
 ! First test is here to have a reference with non polymorphic on both sides.
 ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_parent(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QMpolymorphic_testTp2{a:i32,b:i32,c:f32}>> {fir.bindc_name = "p", fir.target}) {
-! CHECK: %[[TP:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp"}
 ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp.addr"}
 ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
 ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>
@@ -302,7 +301,6 @@ subroutine pointer_assign_non_poly(p)
 
 ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_non_poly(
 ! CHECK-SAME: %arg0: !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {fir.bindc_name = "p", fir.target}) {
-! CHECK: %[[TP:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp"}
 ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp.addr"}
 ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
 ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref<!fir.ptr<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>
@@ -1103,11 +1101,9 @@ subroutine class_with_entry(a)
 
 ! CHECK-LABEL: func.func @_QMpolymorphic_testPclass_with_entry(
 ! CHECK-SAME: %[[A:.*]]: !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {fir.bindc_name = "a"}) {
-! CHECK: %[[B:.*]] = fir.alloca !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {bindc_name = "b", uniq_name = "_QMpolymorphic_testFclass_with_entryEb"}
 
 ! CHECK-LABEL: func.func @_QMpolymorphic_testPd(
 ! CHECK-SAME: %[[B:.*]]: !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {fir.bindc_name = "b"}) {
-! CHECK: %[[A:.*]] = fir.alloca !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {bindc_name = "a", uniq_name = "_QMpolymorphic_testFclass_with_entryEa"}
 
   subroutine class_array_with_entry(a)
     class(p1) :: a(:), b(:)
diff --git a/flang/test/Lower/statement-function.f90 b/flang/test/Lower/statement-function.f90
index cfec06c35baa8..fe07649e669af 100644
--- a/flang/test/Lower/statement-function.f90
+++ b/flang/test/Lower/statement-function.f90
@@ -129,7 +129,6 @@ integer function test_stmt_character_with_different_length_2(c, n)
   character(n) :: argc
   character(*) :: c
   ! CHECK: %[[unboxed:.*]]:2 = fir.unboxchar %[[arg0]] :
-  ! CHECK: fir.load %[[arg1]] : !fir.ref<i32>
   ! CHECK: %[[n:.*]] = fir.load %[[arg1]] : !fir.ref<i32>
   ! CHECK: %[[n_is_positive:.*]] = arith.cmpi sgt, %[[n]], %c0{{.*}} : i32
   ! CHECK: %[[len:.*]] = arith.select %[[n_is_positive]], %[[n]], %c0{{.*}} : i32
diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir
index 4a417ed981ab1..25fc73153003a 100644
--- a/flang/test/Transforms/stack-arrays.fir
+++ b/flang/test/Transforms/stack-arrays.fir
@@ -3,13 +3,17 @@
 // Simplest transformation
 func.func @simple() {
   %0 = fir.allocmem !fir.array<42xi32>
+  %c0_s = arith.constant 0 : index
+  %c0_i32_s = arith.constant 0 : i32
+  %ref_s = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>>
+  %elt_s = fir.coordinate_of %ref_s, %c0_s : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+  fir.store %c0_i32_s to %elt_s : !fir.ref<i32>
   fir.freemem %0 : !fir.heap<!fir.array<42xi32>>
   return
 }
-// CHECK: func.func @simple() {
-// CHECK-NEXT: fir.alloca !fir.array<42xi32>
-// CHECK-NEXT: return
-// CHECK-NEXT: }
+// CHECK: func.func @simple()
+// CHECK: fir.alloca !fir.array<42xi32>
+// CHECK: return
 
 // Check fir.must_be_heap allocations are not moved
 func.func @must_be_heap() {
@@ -17,7 +21,7 @@ func.func @must_be_heap() {
   fir.freemem %0 : !fir.heap<!fir.array<42xi32>>
   return
 }
-// CHECK:      func.func @must_be_heap() {
+// CHECK-LABEL: func.func @must_be_heap()
 // CHECK-NEXT:   %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true}
 // CHECK-NEXT:   fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<42xi32>>
 // CHECK-NEXT:   return
@@ -36,7 +40,7 @@ func.func @dfa1(%arg0: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) {
   }
   return
 }
-// CHECK:      func.func @dfa1(%arg0: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}) {
+// CHECK-LABEL: func.func @dfa1(%arg0: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"})
 // CHECK-NEXT:   %[[C42:.*]] = arith.constant 42 : index
 // CHECK-NEXT:   %[[MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[C42]] {uniq_name = "_QFdfa1Earr.alloc"}
 // CHECK-NEXT:   %[[LOGICAL:.*]] = fir.load %arg0 : !fir.ref<!fir.logical<4>>
@@ -57,7 +61,7 @@ func.func @dfa2(%arg0: i1) {
   }
   return
 }
-// CHECK:     func.func @dfa2(%arg0: i1) {
+// CHECK-LABEL: func.func @dfa2(%arg0: i1)
 // CHECK-NEXT:  %[[MEM:.*]] = fir.allocmem !fir.array<1xi8>
 // CHECK-NEXT:  scf.if %arg0 {
 // CHECK-NEXT:    fir.freemem %[[MEM]] : !fir.heap<!fir.array<1xi8>>
@@ -74,15 +78,16 @@ func.func @dfa3(%arg0: i1) {
   } else {
     fir.freemem %a : !fir.heap<!fir.array<1xi8>>
   }
+  %c0_d3 = arith.constant 0 : index
+  %c0_i8_d3 = arith.constant 0 : i8
+  %ref_d3 = fir.convert %a : (!fir.heap<!fir.array<1xi8>>) -> !fir.ref<!fir.array<1xi8>>
+  %elt_d3 = fir.coordinate_of %ref_d3, %c0_d3 : (!fir.ref<!fir.array<1xi8>>, index) -> !fir.ref<i8>
+  fir.store %c0_i8_d3 to %elt_d3 : !fir.ref<i8>
   return
 }
-// CHECK:     func.func @dfa3(%arg0: i1) {
-// CHECK-NEXT:  %[[MEM:.*]] = fir.alloca !fir.array<1xi8>
-// CHECK-NEXT:  fir.if %arg0 {
-// CHECK-NEXT:  } else {
-// CHECK-NEXT:  }
-// CHECK-NEXT:  return
-// CHECK-NEXT:  }
+// CHECK: func.func @dfa3(%arg0: i1)
+// CHECK:  %[[MEM:.*]] = fir.alloca !fir.array<1xi8>
+// CHECK:  return
 
 func.func private @dfa3a_foo(!fir.ref<!fir.array<1xi8>>) -> ()
 func.func private @dfa3a_bar(!fir.ref<!fir.array<1xi8>>) -> ()
@@ -101,7 +106,7 @@ func.func @dfa3a(%arg0: i1) {
   }
   return
 }
-// CHECK:     func.func @dfa3a(%arg0: i1) {
+// CHECK-LABEL: func.func @dfa3a(%arg0: i1)
 // CHECK-NEXT:  %[[MEM:.*]] = fir.alloca !fir.array<1xi8>
 // CHECK-NEXT:  %[[HEAP:.*]] = fir.convert %[[MEM]] : (!fir.ref<!fir.array<1xi8>>) -> !fir.heap<!fir.array<1xi8>>
 // CHECK-NEXT:  fir.if %arg0 {
@@ -123,13 +128,18 @@ func.func @placement1() {
   // operand is now available
   %4 = fir.allocmem !fir.array<?xi32>, %3
   // ...
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %ref1 = fir.convert %4 : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+  %elt1 = fir.coordinate_of %ref1, %c0 : (!fir.ref<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+  fir.store %c0_i32 to %elt1 : !fir.ref<i32>
   fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
   return
 }
-// CHECK:      func.func @placement1() {
+// CHECK-LABEL: func.func @placement1()
 // CHECK-NEXT:   %[[ARG:.*]] = arith.constant 3 : index
 // CHECK-NEXT:   %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[ARG]]
-// CHECK-NEXT:   return
+// CHECK:   return
 // CHECK-NEXT: }
 
 // check that if there are no operands, then the alloca is placed early
@@ -140,16 +150,21 @@ func.func @placement2() {
   %3 = arith.addi %1, %2 : index
   %4 = fir.allocmem !fir.array<42xi32>
   // ...
+  %c0_p2 = arith.constant 0 : index
+  %c0_i32_p2 = arith.constant 0 : i32
+  %ref_p2 = fir.convert %4 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>>
+  %elt_p2 = fir.coordinate_of %ref_p2, %c0_p2 : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+  fir.store %c0_i32_p2 to %elt_p2 : !fir.ref<i32>
   fir.freemem %4 : !fir.heap<!fir.array<42xi32>>
   return
 }
-// CHECK:      func.func @placement2() {
-// CHECK-NEXT:   %[[MEM:.*]] = fir.alloca !fir.array<42xi32>
-// CHECK-NEXT:   %[[ONE:.*]] = arith.constant 1 : index
-// CHECK-NEXT:   %[[TWO:.*]] = arith.constant 2 : index
-// CHECK-NEXT:   %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index
-// CHECK-NEXT:   return
-// CHECK-NEXT: }
+// CHECK-LABEL: func.func @placement2()
+// CHECK:   %[[MEM:.*]] = fir.alloca !fir.array<42xi32>
+// CHECK:   %[[ONE:.*]] = arith.constant 1 : index
+// CHECK:   %[[TWO:.*]] = arith.constant 2 : index
+// CHECK:   %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index
+// CHECK:   return
+// CHECK: }
 
 // check that stack allocations which must be placed in loops use stacksave
 func.func @placement3() {
@@ -162,12 +177,17 @@ func.func @placement3() {
     // operand is now available
     %4 = fir.allocmem !fir.array<?xi32>, %3
     // ...
+    %c0 = arith.constant 0 : index
+    %c0_i32 = arith.constant 0 : i32
+    %ref2 = fir.convert %4 : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+    %elt2 = fir.coordinate_of %ref2, %c0 : (!fir.ref<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+    fir.store %c0_i32 to %elt2 : !fir.ref<i32>
     fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
     fir.result %3, %c1_i32 : index, i32
   }
   return
 }
-// CHECK:      func.func @placement3() {
+// CHECK-LABEL: func.func @placement3()
 // CHECK-NEXT:   %[[C1:.*]] = arith.constant 1 : index
 // CHECK-NEXT:   %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32
 // CHECK-NEXT:   %[[C2:.*]] = arith.constant 2 : index
@@ -176,7 +196,7 @@ func.func @placement3() {
 // CHECK-NEXT:     %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index
 // CHECK-NEXT:     %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr
 // CHECK-NEXT:     %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[SUM]]
-// CHECK-NEXT:     llvm.intr.stackrestore %[[SP]] : !llvm.ptr
+// CHECK:     llvm.intr.stackrestore %[[SP]] : !llvm.ptr
 // CHECK-NEXT:     fir.result
 // CHECK-NEXT:   }
 // CHECK-NEXT:   return
@@ -194,12 +214,17 @@ func.func @placement4(%arg0 : i1) {
   // operand is now available
   %4 = fir.allocmem !fir.array<?xi32>, %3
   // ...
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %ref3 = fir.convert %4 : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+  %elt3 = fir.coordinate_of %ref3, %c0 : (!fir.ref<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+  fir.store %c0_i32 to %elt3 : !fir.ref<i32>
   fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
   cf.cond_br %arg0, ^bb1, ^bb2
 ^bb2:
   return
 }
-// CHECK:      func.func @placement4(%arg0: i1) {
+// CHECK-LABEL: func.func @placement4(%arg0: i1)
 // CHECK-NEXT:   %[[C1:.*]] = arith.constant 1 : index
 // CHECK-NEXT:   %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32
 // CHECK-NEXT:   %[[C10:.*]] = arith.constant 10 : index
@@ -208,7 +233,7 @@ func.func @placement4(%arg0 : i1) {
 // CHECK-NEXT:   %[[C3:.*]] = arith.constant 3 : index
 // CHECK-NEXT:   %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr
 // CHECK-NEXT:   %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[C3]]
-// CHECK-NEXT:   llvm.intr.stackrestore %[[SP]] : !llvm.ptr
+// CHECK:   llvm.intr.stackrestore %[[SP]] : !llvm.ptr
 // CHECK-NEXT:   cf.cond_br %arg0, ^bb1, ^bb2
 // CHECK-NEXT: ^bb2:
 // CHECK-NEXT:   return
@@ -230,7 +255,7 @@ func.func @placement5() {
   }
   return
 }
-// CHECK:      func.func @placement5() {
+// CHECK-LABEL: func.func @placement5()
 // CHECK-NEXT:   %[[C1:.*]] = arith.constant 1 : index
 // CHECK-NEXT:   %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32
 // CHECK-NEXT:   %[[C2:.*]] = arith.constant 2 : index
@@ -268,7 +293,7 @@ func.func @placement6(%arg0: i1) {
   fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
   cf.br ^bb1
 }
-// CHECK:      func.func @placement6(%arg0: i1) {
+// CHECK-LABEL: func.func @placement6(%arg0: i1)
 // CHECK-NEXT:   %[[c1:.*]] = arith.constant 1 : index
 // CHECK-NEXT:   %[[c1_i32:.*]] = fir.convert %[[c1]] : (index) -> i32
 // CHECK-NEXT:   %[[c2:.*]] = arith.constant 2 : index
@@ -289,6 +314,11 @@ func.func @placement6(%arg0: i1) {
 // Check multiple returns, where the memory is always freed
 func.func @returns(%arg0: i1) {
   %0 = fir.allocmem !fir.array<42xi32>
+  %c0_ret = arith.constant 0 : index
+  %c0_i32_ret = arith.constant 0 : i32
+  %ref_ret = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>>
+  %elt_ret = fir.coordinate_of %ref_ret, %c0_ret : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+  fir.store %c0_i32_ret to %elt_ret : !fir.ref<i32>
   cf.cond_br %arg0, ^bb1, ^bb2
 ^bb1:
   fir.freemem %0 : !fir.heap<!fir.array<42xi32>>
@@ -297,9 +327,9 @@ func.func @returns(%arg0: i1) {
   fir.freemem %0 : !fir.heap<!fir.array<42xi32>>
   return
 }
-// CHECK:      func.func @returns(%[[COND:.*]]: i1) {
-// CHECK-NEXT:   %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32>
-// CHECK-NEXT:   cf.cond_br %[[COND]], ^bb1, ^bb2
+// CHECK-LABEL: func.func @returns(
+// CHECK:   %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32>
+// CHECK:   cf.cond_br %{{.*}}, ^bb1, ^bb2
 // CHECK-NEXT: ^bb1:
 // CHECK-NEXT:   return
 // CHECK-NEXT: ^bb2:
@@ -309,6 +339,11 @@ func.func @returns(%arg0: i1) {
 // Check multiple returns, where the memory is not freed on one branch
 func.func @returns2(%arg0: i1) {
   %0 = fir.allocmem !fir.array<42xi32>
+  %c0_ret2 = arith.constant 0 : index
+  %c0_i32_ret2 = arith.constant 0 : i32
+  %ref_ret2 = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>>
+  %elt_ret2 = fir.coordinate_of %ref_ret2, %c0_ret2 : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+  fir.store %c0_i32_ret2 to %elt_ret2 : !fir.ref<i32>
   cf.cond_br %arg0, ^bb1, ^bb2
 ^bb1:
   fir.freemem %0 : !fir.heap<!fir.array<42xi32>>
@@ -316,9 +351,9 @@ func.func @returns2(%arg0: i1) {
 ^bb2:
   return
 }
-// CHECK:      func.func @returns2(%[[COND:.*]]: i1) {
-// CHECK-NEXT:   %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32>
-// CHECK-NEXT:   cf.cond_br %[[COND]], ^bb1, ^bb2
+// CHECK-LABEL: func.func @returns2(
+// CHECK:   %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32>
+// CHECK:   cf.cond_br %{{.*}}, ^bb1, ^bb2
 // CHECK-NEXT: ^bb1:
 // CHECK-NEXT:   fir.freemem %[[ALLOC]] : !fir.heap<!fir.array<42xi32>>
 // CHECK-NEXT:   return
@@ -338,7 +373,7 @@ func.func @omp_placement1() {
   }
   return
 }
-// CHECK:      func.func @omp_placement1() {
+// CHECK-LABEL: func.func @omp_placement1()
 // CHECK-NEXT:   %[[MEM:.*]] = fir.alloca !fir.array<42xi32>
 // CHECK-NEXT:   %[[MEM_CONV:.*]] = fir.convert %[[MEM]] : (!fir.ref<!fir.array<42xi32>>) -> !fir.heap<!fir.array<42xi32>>
 // CHECK-NEXT:   omp.sections {
@@ -353,19 +388,21 @@ func.func @omp_placement1() {
 // function terminated by stop statement
 func.func @stop_terminator() {
   %0 = fir.allocmem !fir.array<42xi32>
+  %c0 = arith.constant 0 : index
+  %c0_i32_st = arith.constant 0 : i32
+  %ref4 = fir.convert %0 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>>
+  %elt4 = fir.coordinate_of %ref4, %c0 : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+  fir.store %c0_i32_st to %elt4 : !fir.ref<i32>
   fir.freemem %0 : !fir.heap<!fir.array<42xi32>>
   %c0_i32 = arith.constant 0 : i32
   %false = arith.constant false
   fir.call @_FortranAStopStatement(%c0_i32, %false, %false) : (i32, i1, i1) -> ()
   fir.unreachable
 }
-// CHECK: func.func @stop_terminator() {
-// CHECK-NEXT: fir.alloca !fir.array<42xi32>
-// CHECK-NEXT:  %[[ZERO:.*]] = arith.constant 0 : i32
-// CHECK-NEXT:  %[[FALSE:.*]] = arith.constant false
-// CHECK-NEXT:  fir.call @_FortranAStopStatement(%[[ZERO]], %[[FALSE]], %[[FALSE]]) : (i32, i1, i1) -> ()
-// CHECK-NEXT:  fir.unreachable
-// CHECK-NEXT: }
+// CHECK-LABEL: func.func @stop_terminator()
+// CHECK: fir.alloca !fir.array<42xi32>
+// CHECK: fir.call @_FortranAStopStatement(
+// CHECK: fir.unreachable
 
 
 // check that stack allocations that use fir.declare which must be placed in loops
@@ -387,7 +424,7 @@ func.func @placement_loop_declare() {
   }
   return
 }
-// CHECK:      func.func @placement_loop_declare() {
+// CHECK-LABEL: func.func @placement_loop_declare()
 // CHECK-NEXT:   %[[C1:.*]] = arith.constant 1 : index
 // CHECK-NEXT:   %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32
 // CHECK-NEXT:   %[[C2:.*]] = arith.constant 2 : index
@@ -415,7 +452,7 @@ func.func @lookthrough() {
   fir.freemem %4 : !fir.heap<!fir.array<42xi32>>
   return
 }
-// CHECK: func.func @lookthrough() {
+// CHECK-LABEL: func.func @lookthrough()
 // CHECK:     fir.alloca !fir.array<42xi32>
 // CHECK-NOT: fir.freemem
 
@@ -457,6 +494,6 @@ func.func @finding_freemem_in_block() {
 ^bb3:  // pred: ^bb1
   return
 }
-// CHECK: func.func @finding_freemem_in_block() {
+// CHECK-LABEL: func.func @finding_freemem_in_block()
 // CHECK:     fir.alloca !fir.array<?xi32>
 // CHECK-NOT: fir.freemem

From ad9eb0d01649d82b17b68fa66fac2ae0a31744db Mon Sep 17 00:00:00 2001
From: Baranov Victor <bar.victor.2002@gmail.com>
Date: Tue, 11 Nov 2025 01:36:25 +0300
Subject: [PATCH 25/97] Add default empty header filter regex to root
 .clang-tidy (#167386)

After https://github.com/llvm/llvm-project/pull/164165, we emit warnings
from non-system headers by default.
This change only preserves functionality of `clang-tidy` as it was
before the change.
---
 .clang-tidy | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.clang-tidy b/.clang-tidy
index 06bb0f18e9d2e..2cda1b81de808 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -1,3 +1,4 @@
+HeaderFilterRegex: ''
 Checks: >
   -*,
   clang-diagnostic-*,

From 11ab23c33db5c990489023c91260435db288efd8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 14:40:39 -0800
Subject: [PATCH 26/97] CodeGen: Keep reference to TargetRegisterInfo in
 TargetInstrInfo (#158224)

Both conceptually belong to the same subtarget, so it should not
be necessary to pass in the context TargetRegisterInfo to any
TargetInstrInfo member. Add this reference so those superfluous
arguments can be removed.

Most targets placed their TargetRegisterInfo as a member
in TargetInstrInfo. A few had this owned by the TargetSubtargetInfo,
so unify all targets to look the same.
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   | 11 ++-
 llvm/lib/CodeGen/TargetInstrInfo.cpp          | 68 ++++++++-----------
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  2 +-
 llvm/lib/Target/AMDGPU/R600InstrInfo.cpp      |  2 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  3 +-
 llvm/lib/Target/ARC/ARCInstrInfo.cpp          |  3 +-
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp      |  5 +-
 llvm/lib/Target/ARM/ARMBaseInstrInfo.h        |  9 ++-
 llvm/lib/Target/ARM/ARMInstrInfo.cpp          |  3 +-
 llvm/lib/Target/ARM/ARMInstrInfo.h            |  2 +-
 llvm/lib/Target/ARM/Thumb1InstrInfo.cpp       |  2 +-
 llvm/lib/Target/ARM/Thumb1InstrInfo.h         |  2 +-
 llvm/lib/Target/ARM/Thumb2InstrInfo.cpp       |  2 +-
 llvm/lib/Target/ARM/Thumb2InstrInfo.h         |  2 +-
 llvm/lib/Target/AVR/AVRInstrInfo.cpp          |  4 +-
 llvm/lib/Target/BPF/BPFInstrInfo.cpp          |  2 +-
 llvm/lib/Target/CSKY/CSKYInstrInfo.cpp        |  2 +-
 llvm/lib/Target/DirectX/DirectXInstrInfo.cpp  |  2 +-
 llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp  |  4 +-
 llvm/lib/Target/Hexagon/HexagonInstrInfo.h    |  5 ++
 llvm/lib/Target/Hexagon/HexagonSubtarget.cpp  |  3 +-
 llvm/lib/Target/Hexagon/HexagonSubtarget.h    |  3 +-
 llvm/lib/Target/Lanai/LanaiInstrInfo.cpp      |  3 +-
 .../Target/LoongArch/LoongArchInstrInfo.cpp   |  4 +-
 .../lib/Target/LoongArch/LoongArchInstrInfo.h |  4 ++
 .../Target/LoongArch/LoongArchSubtarget.cpp   |  2 +-
 .../lib/Target/LoongArch/LoongArchSubtarget.h |  3 +-
 llvm/lib/Target/MSP430/MSP430InstrInfo.cpp    |  3 +-
 llvm/lib/Target/Mips/Mips16InstrInfo.cpp      |  6 +-
 llvm/lib/Target/Mips/Mips16InstrInfo.h        |  2 +-
 llvm/lib/Target/Mips/MipsInstrInfo.cpp        |  5 +-
 llvm/lib/Target/Mips/MipsInstrInfo.h          |  8 ++-
 llvm/lib/Target/Mips/MipsSEInstrInfo.cpp      |  6 +-
 llvm/lib/Target/Mips/MipsSEInstrInfo.h        |  2 +-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp      |  2 +-
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp      |  2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |  5 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.h        |  3 +
 llvm/lib/Target/RISCV/RISCVSubtarget.cpp      |  2 +-
 llvm/lib/Target/RISCV/RISCVSubtarget.h        |  3 +-
 llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp      |  2 +-
 llvm/lib/Target/Sparc/SparcInstrInfo.cpp      |  4 +-
 llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp  |  2 +-
 llvm/lib/Target/VE/VEInstrInfo.cpp            |  2 +-
 .../WebAssembly/WebAssemblyInstrInfo.cpp      |  2 +-
 llvm/lib/Target/X86/X86InstrInfo.cpp          |  2 +-
 llvm/lib/Target/XCore/XCoreInstrInfo.cpp      |  2 +-
 llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp    |  3 +-
 llvm/unittests/CodeGen/MFCommon.inc           |  4 +-
 llvm/utils/TableGen/InstrInfoEmitter.cpp      |  9 ++-
 50 files changed, 126 insertions(+), 112 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 7010cffe23a11..02cd9256c422a 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -113,15 +113,18 @@ struct ExtAddrMode {
 ///
 class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
 protected:
+  const TargetRegisterInfo &TRI;
+
   /// Subtarget specific sub-array of MCInstrInfo's RegClassByHwModeTables
   /// (i.e. the table for the active HwMode). This should be indexed by
   /// MCOperandInfo's RegClass field for LookupRegClassByHwMode operands.
   const int16_t *const RegClassByHwMode;
 
-  TargetInstrInfo(unsigned CFSetupOpcode = ~0u, unsigned CFDestroyOpcode = ~0u,
-                  unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u,
+  TargetInstrInfo(const TargetRegisterInfo &TRI, unsigned CFSetupOpcode = ~0u,
+                  unsigned CFDestroyOpcode = ~0u, unsigned CatchRetOpcode = ~0u,
+                  unsigned ReturnOpcode = ~0u,
                   const int16_t *const RegClassByHwModeTable = nullptr)
-      : RegClassByHwMode(RegClassByHwModeTable),
+      : TRI(TRI), RegClassByHwMode(RegClassByHwModeTable),
         CallFrameSetupOpcode(CFSetupOpcode),
         CallFrameDestroyOpcode(CFDestroyOpcode), CatchRetOpcode(CatchRetOpcode),
         ReturnOpcode(ReturnOpcode) {}
@@ -131,6 +134,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   TargetInstrInfo &operator=(const TargetInstrInfo &) = delete;
   virtual ~TargetInstrInfo();
 
+  const TargetRegisterInfo &getRegisterInfo() const { return TRI; }
+
   static bool isGenericOpcode(unsigned Opc) {
     return Opc <= TargetOpcode::GENERIC_OP_END;
   }
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 3c41bbeb4b327..7c89e51ff86d3 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -60,7 +60,7 @@ TargetInstrInfo::~TargetInstrInfo() = default;
 
 const TargetRegisterClass *
 TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
-                             const TargetRegisterInfo *TRI) const {
+                             const TargetRegisterInfo * /*RemoveMe*/) const {
   if (OpNum >= MCID.getNumOperands())
     return nullptr;
 
@@ -69,14 +69,14 @@ TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
 
   // TODO: Remove isLookupPtrRegClass in favor of isLookupRegClassByHwMode
   if (OpInfo.isLookupPtrRegClass())
-    return TRI->getPointerRegClass(RegClass);
+    return TRI.getPointerRegClass(RegClass);
 
   // Instructions like INSERT_SUBREG do not have fixed register classes.
   if (RegClass < 0)
     return nullptr;
 
   // Otherwise just look it up normally.
-  return TRI->getRegClass(RegClass);
+  return TRI.getRegClass(RegClass);
 }
 
 /// insertNoop - Insert a noop into the instruction stream at the specified
@@ -223,13 +223,11 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI,
   //   %1.sub = INST %1.sub(tied), %0.sub, implicit-def %1
   SmallVector<unsigned> UpdateImplicitDefIdx;
   if (HasDef && MI.hasImplicitDef()) {
-    const TargetRegisterInfo *TRI =
-        MI.getMF()->getSubtarget().getRegisterInfo();
     for (auto [OpNo, MO] : llvm::enumerate(MI.implicit_operands())) {
       Register ImplReg = MO.getReg();
       if ((ImplReg.isVirtual() && ImplReg == Reg0) ||
           (ImplReg.isPhysical() && Reg0.isPhysical() &&
-           TRI->isSubRegisterEq(ImplReg, Reg0)))
+           TRI.isSubRegisterEq(ImplReg, Reg0)))
         UpdateImplicitDefIdx.push_back(OpNo + MI.getNumExplicitOperands());
     }
   }
@@ -425,37 +423,35 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
                                         unsigned SubIdx, unsigned &Size,
                                         unsigned &Offset,
                                         const MachineFunction &MF) const {
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   if (!SubIdx) {
-    Size = TRI->getSpillSize(*RC);
+    Size = TRI.getSpillSize(*RC);
     Offset = 0;
     return true;
   }
-  unsigned BitSize = TRI->getSubRegIdxSize(SubIdx);
+  unsigned BitSize = TRI.getSubRegIdxSize(SubIdx);
   // Convert bit size to byte size.
   if (BitSize % 8)
     return false;
 
-  int BitOffset = TRI->getSubRegIdxOffset(SubIdx);
+  int BitOffset = TRI.getSubRegIdxOffset(SubIdx);
   if (BitOffset < 0 || BitOffset % 8)
     return false;
 
   Size = BitSize / 8;
   Offset = (unsigned)BitOffset / 8;
 
-  assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range");
+  assert(TRI.getSpillSize(*RC) >= (Offset + Size) && "bad subregister range");
 
   if (!MF.getDataLayout().isLittleEndian()) {
-    Offset = TRI->getSpillSize(*RC) - (Offset + Size);
+    Offset = TRI.getSpillSize(*RC) - (Offset + Size);
   }
   return true;
 }
 
-void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator I,
-                                    Register DestReg, unsigned SubIdx,
-                                    const MachineInstr &Orig,
-                                    const TargetRegisterInfo &TRI) const {
+void TargetInstrInfo::reMaterialize(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
+    unsigned SubIdx, const MachineInstr &Orig,
+    const TargetRegisterInfo & /*Remove me*/) const {
   MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
   MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI);
   MBB.insert(I, MI);
@@ -726,7 +722,6 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
   // actual load size is.
   int64_t MemSize = 0;
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
 
   if (Flags & MachineMemOperand::MOStore) {
     MemSize = MFI.getObjectSize(FI);
@@ -735,7 +730,7 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
       int64_t OpSize = MFI.getObjectSize(FI);
 
       if (auto SubReg = MI.getOperand(OpIdx).getSubReg()) {
-        unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg);
+        unsigned SubRegSize = TRI.getSubRegIdxSize(SubReg);
         if (SubRegSize > 0 && !(SubRegSize % 8))
           OpSize = SubRegSize / 8;
       }
@@ -800,11 +795,11 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
       // code.
       BuildMI(*MBB, Pos, MI.getDebugLoc(), get(TargetOpcode::KILL)).add(MO);
     } else {
-      storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI,
+      storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, &TRI,
                           Register());
     }
   } else
-    loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI, Register());
+    loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, &TRI, Register());
 
   return &*--Pos;
 }
@@ -880,8 +875,8 @@ static void transferImplicitOperands(MachineInstr *MI,
   }
 }
 
-void TargetInstrInfo::lowerCopy(MachineInstr *MI,
-                                const TargetRegisterInfo *TRI) const {
+void TargetInstrInfo::lowerCopy(
+    MachineInstr *MI, const TargetRegisterInfo * /*Remove me*/) const {
   if (MI->allDefsAreDead()) {
     MI->setDesc(get(TargetOpcode::KILL));
     return;
@@ -911,7 +906,7 @@ void TargetInstrInfo::lowerCopy(MachineInstr *MI,
               SrcMO.getReg().isPhysical() ? SrcMO.isRenamable() : false);
 
   if (MI->getNumOperands() > 2)
-    transferImplicitOperands(MI, TRI);
+    transferImplicitOperands(MI, &TRI);
   MI->eraseFromParent();
 }
 
@@ -1327,8 +1322,7 @@ void TargetInstrInfo::reassociateOps(
   MachineFunction *MF = Root.getMF();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
-  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
-  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, &TRI);
 
   MachineOperand &OpA = Prev.getOperand(OperandIndices[1]);
   MachineOperand &OpB = Root.getOperand(OperandIndices[2]);
@@ -1704,8 +1698,7 @@ bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
   // stack slot reference to depend on the instruction that does the
   // modification.
   const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI);
+  return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), &TRI);
 }
 
 // Provide a global flag for disabling the PreRA hazard recognizer that targets
@@ -1738,11 +1731,11 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
 // Default implementation of getMemOperandWithOffset.
 bool TargetInstrInfo::getMemOperandWithOffset(
     const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset,
-    bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
+    bool &OffsetIsScalable, const TargetRegisterInfo * /*RemoveMe*/) const {
   SmallVector<const MachineOperand *, 4> BaseOps;
   LocationSize Width = LocationSize::precise(0);
   if (!getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable,
-                                     Width, TRI) ||
+                                     Width, &TRI) ||
       BaseOps.size() != 1)
     return false;
   BaseOp = BaseOps.front();
@@ -1863,7 +1856,6 @@ std::optional<ParamLoadedValue>
 TargetInstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
   const MachineFunction *MF = MI.getMF();
-  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
   DIExpression *Expr = DIExpression::get(MF->getFunction().getContext(), {});
   int64_t Offset;
   bool OffsetIsScalable;
@@ -1894,7 +1886,6 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI,
     // Only describe memory which provably does not escape the function. As
     // described in llvm.org/PR43343, escaped memory may be clobbered by the
     // callee (or by another thread).
-    const auto &TII = MF->getSubtarget().getInstrInfo();
     const MachineFrameInfo &MFI = MF->getFrameInfo();
     const MachineMemOperand *MMO = MI.memoperands()[0];
     const PseudoSourceValue *PSV = MMO->getPseudoValue();
@@ -1905,8 +1896,7 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI,
       return std::nullopt;
 
     const MachineOperand *BaseOp;
-    if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable,
-                                      TRI))
+    if (!getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, &TRI))
       return std::nullopt;
 
     // FIXME: Scalable offsets are not yet handled in the offset code below.
@@ -2045,7 +2035,7 @@ bool TargetInstrInfo::getInsertSubregInputs(
 // Returns a MIRPrinter comment for this machine operand.
 std::string TargetInstrInfo::createMIROperandComment(
     const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx,
-    const TargetRegisterInfo *TRI) const {
+    const TargetRegisterInfo * /*RemoveMe*/) const {
 
   if (!MI.isInlineAsm())
     return "";
@@ -2078,12 +2068,8 @@ std::string TargetInstrInfo::createMIROperandComment(
   OS << F.getKindName();
 
   unsigned RCID;
-  if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID)) {
-    if (TRI) {
-      OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID));
-    } else
-      OS << ":RC" << RCID;
-  }
+  if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID))
+    OS << ':' << TRI.getRegClassName(TRI.getRegClass(RCID));
 
   if (F.isMemKind()) {
     InlineAsm::ConstraintCode MCID = F.getMemoryConstraintID();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 5e95807a68199..66e49491b18e0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -91,7 +91,7 @@ static cl::opt<unsigned> GatherOptSearchLimit(
              "machine-combiner gather pattern optimization"));
 
 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
-    : AArch64GenInstrInfo(STI, AArch64::ADJCALLSTACKDOWN,
+    : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
                           AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
       RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
 
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 3e256cce97afb..01040854e1577 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
 #include "R600GenInstrInfo.inc"
 
 R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
-    : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {}
+    : R600GenInstrInfo(ST, RI, -1, -1), RI(), ST(ST) {}
 
 bool R600InstrInfo::isVector(const MachineInstr &MI) const {
   return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9c74c654d8e35..6b397e0834686 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -63,7 +63,8 @@ static cl::opt<bool> Fix16BitCopies(
   cl::ReallyHidden);
 
 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
-    : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+    : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
+                         AMDGPU::ADJCALLSTACKDOWN),
       RI(ST), ST(ST) {
   SchedModel.init(&ST);
 }
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
index 05bcb3596ac48..2dec6ffd89a75 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
@@ -44,7 +44,8 @@ enum TSFlagsConstants {
 void ARCInstrInfo::anchor() {}
 
 ARCInstrInfo::ARCInstrInfo(const ARCSubtarget &ST)
-    : ARCGenInstrInfo(ST, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {}
+    : ARCGenInstrInfo(ST, RI, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP),
+      RI(ST) {}
 
 static bool isZeroImm(const MachineOperand &Op) {
   return Op.isImm() && Op.getImm() == 0;
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 22769dbf38719..b466ca6f19b3e 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -107,8 +107,9 @@ static const ARM_MLxEntry ARM_MLxTable[] = {
   { ARM::VMLSslfq,    ARM::VMULslfq,    ARM::VSUBfq,     false,  true  },
 };
 
-ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI)
-    : ARMGenInstrInfo(STI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI,
+                                   const ARMBaseRegisterInfo &TRI)
+    : ARMGenInstrInfo(STI, TRI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
       Subtarget(STI) {
   for (unsigned i = 0, e = std::size(ARM_MLxTable); i != e; ++i) {
     if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second)
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 2869e7f708046..27f8e3b3170e7 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -44,7 +44,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
 
 protected:
   // Can be only subclassed.
-  explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
+  explicit ARMBaseInstrInfo(const ARMSubtarget &STI,
+                            const ARMBaseRegisterInfo &TRI);
 
   void expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
                                 unsigned LoadImmOpc, unsigned LoadOpc) const;
@@ -125,7 +126,11 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
   // if there is not such an opcode.
   virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0;
 
-  virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
+  const ARMBaseRegisterInfo &getRegisterInfo() const {
+    return static_cast<const ARMBaseRegisterInfo &>(
+        TargetInstrInfo::getRegisterInfo());
+  }
+
   const ARMSubtarget &getSubtarget() const { return Subtarget; }
 
   ScheduleHazardRecognizer *
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/llvm/lib/Target/ARM/ARMInstrInfo.cpp
index c684de7252e5d..f37054736b730 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -25,7 +25,8 @@
 #include "llvm/MC/MCInst.h"
 using namespace llvm;
 
-ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI) {}
+ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
+    : ARMBaseInstrInfo(STI, RI) {}
 
 /// Return the noop instruction to use for a noop.
 MCInst ARMInstrInfo::getNop() const {
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.h b/llvm/lib/Target/ARM/ARMInstrInfo.h
index 178d7a2c630e4..9feaf1440f2b2 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.h
@@ -35,7 +35,7 @@ class ARMInstrInfo : public ARMBaseInstrInfo {
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
   ///
-  const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
+  const ARMRegisterInfo &getRegisterInfo() const { return RI; }
 
 private:
   void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
index 4b8c2fd569ead..f95ba6a4d86fb 100644
--- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -24,7 +24,7 @@
 using namespace llvm;
 
 Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
-    : ARMBaseInstrInfo(STI), RI(STI) {}
+    : ARMBaseInstrInfo(STI, RI), RI(STI) {}
 
 /// Return the noop instruction to use for a noop.
 MCInst Thumb1InstrInfo::getNop() const {
diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/llvm/lib/Target/ARM/Thumb1InstrInfo.h
index 68b326c0ebef6..16350a65a6198 100644
--- a/llvm/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -35,7 +35,7 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo {
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
   ///
-  const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
+  const ThumbRegisterInfo &getRegisterInfo() const { return RI; }
 
   void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                    const DebugLoc &DL, Register DestReg, Register SrcReg,
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index f5653d459eac8..b66e4075d4cfa 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -46,7 +46,7 @@ PreferNoCSEL("prefer-no-csel", cl::Hidden,
              cl::init(false));
 
 Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
-    : ARMBaseInstrInfo(STI), RI(STI) {}
+    : ARMBaseInstrInfo(STI, RI), RI(STI) {}
 
 /// Return the noop instruction to use for a noop.
 MCInst Thumb2InstrInfo::getNop() const {
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index 1b0bf2d499510..59ef39d442aef 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -58,7 +58,7 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo {
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
   ///
-  const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
+  const ThumbRegisterInfo &getRegisterInfo() const { return RI; }
 
   MachineInstr *optimizeSelect(MachineInstr &MI,
                                SmallPtrSetImpl<MachineInstr *> &SeenMIs,
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
index ce9908597dcac..5e247cb64a991 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -30,8 +30,8 @@
 namespace llvm {
 
 AVRInstrInfo::AVRInstrInfo(const AVRSubtarget &STI)
-    : AVRGenInstrInfo(STI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(),
-      STI(STI) {}
+    : AVRGenInstrInfo(STI, RI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP),
+      RI(), STI(STI) {}
 
 void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MI,
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
index 409f8b4c253b8..0e56e658952a2 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -27,7 +27,7 @@
 using namespace llvm;
 
 BPFInstrInfo::BPFInstrInfo(const BPFSubtarget &STI)
-    : BPFGenInstrInfo(STI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {}
+    : BPFGenInstrInfo(STI, RI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {}
 
 void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
index 619a797be6dc7..34a7de8d8ae96 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
@@ -25,7 +25,7 @@ using namespace llvm;
 #include "CSKYGenInstrInfo.inc"
 
 CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI)
-    : CSKYGenInstrInfo(STI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP),
+    : CSKYGenInstrInfo(STI, RI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP),
       STI(STI) {
   v2sf = STI.hasFPUv2SingleFloat();
   v2df = STI.hasFPUv2DoubleFloat();
diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
index bb2efa43d818c..401881d6d0f67 100644
--- a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp
@@ -19,6 +19,6 @@
 using namespace llvm;
 
 DirectXInstrInfo::DirectXInstrInfo(const DirectXSubtarget &STI)
-    : DirectXGenInstrInfo(STI) {}
+    : DirectXGenInstrInfo(STI, RI) {}
 
 DirectXInstrInfo::~DirectXInstrInfo() {}
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 55bafdea234fd..dd9f2fa5cf342 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -118,9 +118,9 @@ const int Hexagon_ADDI_OFFSET_MIN = -32768;
 void HexagonInstrInfo::anchor() {}
 
 HexagonInstrInfo::HexagonInstrInfo(const HexagonSubtarget &ST)
-    : HexagonGenInstrInfo(ST, Hexagon::ADJCALLSTACKDOWN,
+    : HexagonGenInstrInfo(ST, RegInfo, Hexagon::ADJCALLSTACKDOWN,
                           Hexagon::ADJCALLSTACKUP),
-      Subtarget(ST) {}
+      RegInfo(ST.getHwMode()), Subtarget(ST) {}
 
 namespace llvm {
 namespace HexagonFUnits {
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 48adf82833f51..7a0c77cc3f705 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -23,6 +23,8 @@
 #include <cstdint>
 #include <vector>
 
+#include "HexagonRegisterInfo.h"
+
 #define GET_INSTRINFO_HEADER
 #include "HexagonGenInstrInfo.inc"
 
@@ -36,6 +38,7 @@ class MachineOperand;
 class TargetRegisterInfo;
 
 class HexagonInstrInfo : public HexagonGenInstrInfo {
+  const HexagonRegisterInfo RegInfo;
   const HexagonSubtarget &Subtarget;
 
   enum BundleAttribute {
@@ -47,6 +50,8 @@ class HexagonInstrInfo : public HexagonGenInstrInfo {
 public:
   explicit HexagonInstrInfo(const HexagonSubtarget &ST);
 
+  const HexagonRegisterInfo &getRegisterInfo() const { return RegInfo; }
+
   /// TargetInstrInfo overrides.
 
   /// If the specified machine instruction is a direct
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index a3c8a882c0616..66c8b0a67169d 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -76,8 +76,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
       OptLevel(TM.getOptLevel()),
       CPUString(std::string(Hexagon_MC::selectHexagonCPU(CPU))),
       TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
-      RegInfo(getHwMode()), TLInfo(TM, *this),
-      InstrItins(getInstrItineraryForCPU(CPUString)) {
+      TLInfo(TM, *this), InstrItins(getInstrItineraryForCPU(CPUString)) {
   Hexagon_MC::addArchSubtarget(this, FS);
   // Beware of the default constructor of InstrItineraryData: it will
   // reset all members to 0.
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index 995f66d0551b4..30794f61218a1 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -100,7 +100,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
   // The following objects can use the TargetTriple, so they must be
   // declared after it.
   HexagonInstrInfo InstrInfo;
-  HexagonRegisterInfo RegInfo;
   HexagonTargetLowering TLInfo;
   HexagonSelectionDAGInfo TSInfo;
   HexagonFrameLowering FrameLowering;
@@ -122,7 +121,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
   }
   const HexagonInstrInfo *getInstrInfo() const override { return &InstrInfo; }
   const HexagonRegisterInfo *getRegisterInfo() const override {
-    return &RegInfo;
+    return &InstrInfo.getRegisterInfo();
   }
   const HexagonTargetLowering *getTargetLowering() const override {
     return &TLInfo;
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index 02ed1001cd0d3..b3d28565d4f06 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -27,7 +27,8 @@ using namespace llvm;
 #include "LanaiGenInstrInfo.inc"
 
 LanaiInstrInfo::LanaiInstrInfo(const LanaiSubtarget &STI)
-    : LanaiGenInstrInfo(STI, Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP),
+    : LanaiGenInstrInfo(STI, RegisterInfo, Lanai::ADJCALLSTACKDOWN,
+                        Lanai::ADJCALLSTACKUP),
       RegisterInfo() {}
 
 void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 9a35df2f240c4..5eb3bd6d01a64 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -26,9 +26,9 @@ using namespace llvm;
 #include "LoongArchGenInstrInfo.inc"
 
 LoongArchInstrInfo::LoongArchInstrInfo(const LoongArchSubtarget &STI)
-    : LoongArchGenInstrInfo(STI, LoongArch::ADJCALLSTACKDOWN,
+    : LoongArchGenInstrInfo(STI, RegInfo, LoongArch::ADJCALLSTACKDOWN,
                             LoongArch::ADJCALLSTACKUP),
-      STI(STI) {}
+      RegInfo(STI.getHwMode()), STI(STI) {}
 
 MCInst LoongArchInstrInfo::getNop() const {
   return MCInstBuilder(LoongArch::ANDI)
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index e61314c034bdb..d43d229256c13 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -24,9 +24,13 @@ namespace llvm {
 class LoongArchSubtarget;
 
 class LoongArchInstrInfo : public LoongArchGenInstrInfo {
+  const LoongArchRegisterInfo RegInfo;
+
 public:
   explicit LoongArchInstrInfo(const LoongArchSubtarget &STI);
 
+  const LoongArchRegisterInfo &getRegisterInfo() const { return RegInfo; }
+
   MCInst getNop() const override;
 
   void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp
index 3acbe4992273a..76a8ba1c90e50 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp
@@ -95,4 +95,4 @@ LoongArchSubtarget::LoongArchSubtarget(const Triple &TT, StringRef CPU,
     : LoongArchGenSubtargetInfo(TT, CPU, TuneCPU, FS),
       FrameLowering(
           initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)),
-      InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) {}
+      InstrInfo(*this), TLInfo(TM, *this) {}
diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
index 5e12bafebb0d5..2beff07949daf 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
@@ -45,7 +45,6 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
   LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown;
   LoongArchFrameLowering FrameLowering;
   LoongArchInstrInfo InstrInfo;
-  LoongArchRegisterInfo RegInfo;
   LoongArchTargetLowering TLInfo;
   SelectionDAGTargetInfo TSInfo;
 
@@ -78,7 +77,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
   }
   const LoongArchInstrInfo *getInstrInfo() const override { return &InstrInfo; }
   const LoongArchRegisterInfo *getRegisterInfo() const override {
-    return &RegInfo;
+    return &InstrInfo.getRegisterInfo();
   }
   const LoongArchTargetLowering *getTargetLowering() const override {
     return &TLInfo;
diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
index 65b4820752c94..af053b8645b9b 100644
--- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -26,7 +26,8 @@ using namespace llvm;
 void MSP430InstrInfo::anchor() {}
 
 MSP430InstrInfo::MSP430InstrInfo(const MSP430Subtarget &STI)
-    : MSP430GenInstrInfo(STI, MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
+    : MSP430GenInstrInfo(STI, RI, MSP430::ADJCALLSTACKDOWN,
+                         MSP430::ADJCALLSTACKUP),
       RI() {}
 
 void MSP430InstrInfo::storeRegToStackSlot(
diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
index aa94f54cdf9a0..69b96cf3fc933 100644
--- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -37,11 +37,7 @@ using namespace llvm;
 #define DEBUG_TYPE "mips16-instrinfo"
 
 Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI)
-    : MipsInstrInfo(STI, Mips::Bimm16), RI(STI) {}
-
-const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
-  return RI;
-}
+    : MipsInstrInfo(STI, RI, Mips::Bimm16), RI(STI) {}
 
 /// isLoadFromStackSlot - If the specified machine instruction is a direct
 /// load from a stack slot, return the virtual or physical register number of
diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.h b/llvm/lib/Target/Mips/Mips16InstrInfo.h
index 1058e8c25fb5b..2834fd3d7eec2 100644
--- a/llvm/lib/Target/Mips/Mips16InstrInfo.h
+++ b/llvm/lib/Target/Mips/Mips16InstrInfo.h
@@ -30,7 +30,7 @@ class Mips16InstrInfo : public MipsInstrInfo {
 public:
   explicit Mips16InstrInfo(const MipsSubtarget &STI);
 
-  const MipsRegisterInfo &getRegisterInfo() const override;
+  const Mips16RegisterInfo &getRegisterInfo() const { return RI; }
 
   /// isLoadFromStackSlot - If the specified machine instruction is a direct
   /// load from a stack slot, return the virtual or physical register number of
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp
index bffdffa4af6a0..c879c46e49dd4 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp
@@ -39,8 +39,9 @@ using namespace llvm;
 // Pin the vtable to this file.
 void MipsInstrInfo::anchor() {}
 
-MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr)
-    : MipsGenInstrInfo(STI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
+MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI,
+                             const MipsRegisterInfo &RI, unsigned UncondBr)
+    : MipsGenInstrInfo(STI, RI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
       Subtarget(STI), UncondBrOpc(UncondBr) {}
 
 const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) {
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h
index 2337ae7c079e7..fc9424808756a 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.h
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.h
@@ -55,7 +55,8 @@ class MipsInstrInfo : public MipsGenInstrInfo {
     BT_Indirect    // One indirct branch.
   };
 
-  explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc);
+  explicit MipsInstrInfo(const MipsSubtarget &STI, const MipsRegisterInfo &RI,
+                         unsigned UncondBrOpc);
 
   MCInst getNop() const override;
 
@@ -130,7 +131,10 @@ class MipsInstrInfo : public MipsGenInstrInfo {
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
-  virtual const MipsRegisterInfo &getRegisterInfo() const = 0;
+  const MipsRegisterInfo &getRegisterInfo() const {
+    return static_cast<const MipsRegisterInfo &>(
+        TargetInstrInfo::getRegisterInfo());
+  }
 
   virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0;
 
diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
index dbdbb179a583d..517f489984c27 100644
--- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -28,11 +28,7 @@ static unsigned getUnconditionalBranch(const MipsSubtarget &STI) {
 }
 
 MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI)
-    : MipsInstrInfo(STI, getUnconditionalBranch(STI)), RI(STI) {}
-
-const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
-  return RI;
-}
+    : MipsInstrInfo(STI, RI, getUnconditionalBranch(STI)), RI(STI) {}
 
 /// isLoadFromStackSlot - If the specified machine instruction is a direct
 /// load from a stack slot, return the virtual or physical register number of
diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/llvm/lib/Target/Mips/MipsSEInstrInfo.h
index 2b4f55d184b8b..0a7a0e5ac2a56 100644
--- a/llvm/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.h
@@ -24,7 +24,7 @@ class MipsSEInstrInfo : public MipsInstrInfo {
 public:
   explicit MipsSEInstrInfo(const MipsSubtarget &STI);
 
-  const MipsRegisterInfo &getRegisterInfo() const override;
+  const MipsSERegisterInfo &getRegisterInfo() const { return RI; }
 
   /// isLoadFromStackSlot - If the specified machine instruction is a direct
   /// load from a stack slot, return the virtual or physical register number of
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 6840c7ae8faf4..db2d96f5ff532 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -26,7 +26,7 @@ using namespace llvm;
 void NVPTXInstrInfo::anchor() {}
 
 NVPTXInstrInfo::NVPTXInstrInfo(const NVPTXSubtarget &STI)
-    : NVPTXGenInstrInfo(STI), RegInfo() {}
+    : NVPTXGenInstrInfo(STI, RegInfo), RegInfo() {}
 
 void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 3014aa6bfe31e..8d9d4c7f8e128 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -89,7 +89,7 @@ static cl::opt<bool> EnableFMARegPressureReduction(
 void PPCInstrInfo::anchor() {}
 
 PPCInstrInfo::PPCInstrInfo(const PPCSubtarget &STI)
-    : PPCGenInstrInfo(STI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP,
+    : PPCGenInstrInfo(STI, RI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP,
                       /* CatchRetOpcode */ -1,
                       STI.isPPC64() ? PPC::BLR8 : PPC::BLR),
       Subtarget(STI), RI(STI.getTargetMachine()) {}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index b05956b674d18..ce8dd3ba4b7b3 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -82,8 +82,9 @@ namespace llvm::RISCV {
 } // end namespace llvm::RISCV
 
 RISCVInstrInfo::RISCVInstrInfo(const RISCVSubtarget &STI)
-    : RISCVGenInstrInfo(STI, RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP),
-      STI(STI) {}
+    : RISCVGenInstrInfo(STI, RegInfo, RISCV::ADJCALLSTACKDOWN,
+                        RISCV::ADJCALLSTACKUP),
+      RegInfo(STI.getHwMode()), STI(STI) {}
 
 #define GET_INSTRINFO_HELPERS
 #include "RISCVGenInstrInfo.inc"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index c5eddb9e90fbf..800af2621b7df 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -79,10 +79,13 @@ enum RISCVMachineCombinerPattern : unsigned {
 };
 
 class RISCVInstrInfo : public RISCVGenInstrInfo {
+  const RISCVRegisterInfo RegInfo;
 
 public:
   explicit RISCVInstrInfo(const RISCVSubtarget &STI);
 
+  const RISCVRegisterInfo &getRegisterInfo() const { return RegInfo; }
+
   MCInst getNop() const override;
 
   Register isLoadFromStackSlot(const MachineInstr &MI,
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index 715ac4cedc649..3b43be3d15743 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -104,7 +104,7 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU,
       RVVVectorBitsMin(RVVVectorBitsMin), RVVVectorBitsMax(RVVVectorBitsMax),
       FrameLowering(
           initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)),
-      InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) {
+      InstrInfo(*this), TLInfo(TM, *this) {
   TSInfo = std::make_unique<RISCVSelectionDAGInfo>();
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 4b4fc8f0d8e76..d5ffa6c812cec 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -112,7 +112,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
 
   RISCVFrameLowering FrameLowering;
   RISCVInstrInfo InstrInfo;
-  RISCVRegisterInfo RegInfo;
   RISCVTargetLowering TLInfo;
 
   /// Initializes using the passed in CPU and feature strings so that we can
@@ -140,7 +139,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
   }
   const RISCVInstrInfo *getInstrInfo() const override { return &InstrInfo; }
   const RISCVRegisterInfo *getRegisterInfo() const override {
-    return &RegInfo;
+    return &InstrInfo.getRegisterInfo();
   }
   const RISCVTargetLowering *getTargetLowering() const override {
     return &TLInfo;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
index ba95ad822df75..4f8bf4312a380 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -24,7 +24,7 @@
 using namespace llvm;
 
 SPIRVInstrInfo::SPIRVInstrInfo(const SPIRVSubtarget &STI)
-    : SPIRVGenInstrInfo(STI) {}
+    : SPIRVGenInstrInfo(STI, RI) {}
 
 bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
index f66eb9dbee2dc..fcd6cd7f262bc 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -38,8 +38,8 @@ static cl::opt<unsigned>
 void SparcInstrInfo::anchor() {}
 
 SparcInstrInfo::SparcInstrInfo(const SparcSubtarget &ST)
-    : SparcGenInstrInfo(ST, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(ST),
-      Subtarget(ST) {}
+    : SparcGenInstrInfo(ST, RI, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP),
+      RI(ST), Subtarget(ST) {}
 
 /// isLoadFromStackSlot - If the specified machine instruction is a direct
 /// load from a stack slot, return the virtual or physical register number of
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 2e21f27c9032f..23e7e7e314033 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -60,7 +60,7 @@ static uint64_t allOnes(unsigned int Count) {
 void SystemZInstrInfo::anchor() {}
 
 SystemZInstrInfo::SystemZInstrInfo(const SystemZSubtarget &sti)
-    : SystemZGenInstrInfo(sti, -1, -1),
+    : SystemZGenInstrInfo(sti, RI, -1, -1),
       RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister(),
          sti.getHwMode()),
       STI(sti) {}
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
index d5e804afd27fe..bae703bd97ac8 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -35,7 +35,7 @@ using namespace llvm;
 void VEInstrInfo::anchor() {}
 
 VEInstrInfo::VEInstrInfo(const VESubtarget &ST)
-    : VEGenInstrInfo(ST, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {}
+    : VEGenInstrInfo(ST, RI, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {}
 
 static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 343d90e88950f..8b4e4fbbbd1e5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -34,7 +34,7 @@ using namespace llvm;
 #include "WebAssemblyGenInstrInfo.inc"
 
 WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
-    : WebAssemblyGenInstrInfo(STI, WebAssembly::ADJCALLSTACKDOWN,
+    : WebAssemblyGenInstrInfo(STI, RI, WebAssembly::ADJCALLSTACKDOWN,
                               WebAssembly::ADJCALLSTACKUP,
                               WebAssembly::CATCHRET),
       RI(STI.getTargetTriple()) {}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 6b2a7a4ec3583..2c6d1af225b6b 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -85,7 +85,7 @@ static cl::opt<unsigned> UndefRegClearance(
 void X86InstrInfo::anchor() {}
 
 X86InstrInfo::X86InstrInfo(const X86Subtarget &STI)
-    : X86GenInstrInfo(STI,
+    : X86GenInstrInfo(STI, RI,
                       (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
                                                : X86::ADJCALLSTACKDOWN32),
                       (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
index 1a9133aad4dd3..80fda3480aa44 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -43,7 +43,7 @@ namespace XCore {
 void XCoreInstrInfo::anchor() {}
 
 XCoreInstrInfo::XCoreInstrInfo(const XCoreSubtarget &ST)
-    : XCoreGenInstrInfo(ST, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
+    : XCoreGenInstrInfo(ST, RI, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
       RI() {}
 
 static bool isZeroImm(const MachineOperand &op) {
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
index be69cefb5b78f..6bbebdecd9988 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -48,7 +48,8 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) {
 }
 
 XtensaInstrInfo::XtensaInstrInfo(const XtensaSubtarget &STI)
-    : XtensaGenInstrInfo(STI, Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP),
+    : XtensaGenInstrInfo(STI, RI, Xtensa::ADJCALLSTACKDOWN,
+                         Xtensa::ADJCALLSTACKUP),
       RI(STI), STI(STI) {}
 
 Register XtensaInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index 0180ba0a6c163..783267aa05b59 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -76,8 +76,10 @@ public:
 };
 
 class BogusTargetInstrInfo : public TargetInstrInfo {
+  BogusRegisterInfo RegInfo;
+
 public:
-  BogusTargetInstrInfo() : TargetInstrInfo() {}
+  BogusTargetInstrInfo() : TargetInstrInfo(RegInfo) {}
 };
 
 class BogusSubtarget : public TargetSubtargetInfo {
diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp
index ee3cd8c20f8e7..32994c12aa98b 100644
--- a/llvm/utils/TableGen/InstrInfoEmitter.cpp
+++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -1103,7 +1103,8 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
       Twine ClassName = TargetName + "GenInstrInfo";
       OS << "struct " << ClassName << " : public TargetInstrInfo {\n"
          << "  explicit " << ClassName
-         << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode = ~0u, "
+         << "(const TargetSubtargetInfo &STI, const TargetRegisterInfo &TRI, "
+            "unsigned CFSetupOpcode = ~0u, "
             "unsigned CFDestroyOpcode = ~0u, "
             "unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u);\n"
          << "  ~" << ClassName << "() override = default;\n"
@@ -1157,9 +1158,11 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
          << TargetName << "InstrComplexDeprecationInfos[];\n";
     Twine ClassName = TargetName + "GenInstrInfo";
     OS << ClassName << "::" << ClassName
-       << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode, unsigned "
+       << "(const TargetSubtargetInfo &STI, const TargetRegisterInfo &TRI, "
+          "unsigned CFSetupOpcode, unsigned "
           "CFDestroyOpcode, unsigned CatchRetOpcode, unsigned ReturnOpcode)\n"
-       << "  : TargetInstrInfo(CFSetupOpcode, CFDestroyOpcode, CatchRetOpcode, "
+       << "  : TargetInstrInfo(TRI, CFSetupOpcode, CFDestroyOpcode, "
+          "CatchRetOpcode, "
           "ReturnOpcode";
     if (NumClassesByHwMode != 0)
       OS << ", " << TargetName

From da996a3805f43333d1d391b223ec756d53f5e830 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Mon, 10 Nov 2025 14:43:46 -0800
Subject: [PATCH 27/97] [NFC][SpecialCaseList] Refactor error handling
 (#167277)

Will help in future patches.
---
 llvm/lib/Support/SpecialCaseList.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index 395a55d75acd4..beec8b8ef0d5b 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -274,11 +274,12 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
 
   bool RemoveDotSlash = Version > 2;
 
-  Section *CurrentSection;
-  if (auto Err = addSection("*", FileIdx, 1, true).moveInto(CurrentSection)) {
+  auto ErrOrSection = addSection("*", FileIdx, 1, true);
+  if (auto Err = ErrOrSection.takeError()) {
     Error = toString(std::move(Err));
     return false;
   }
+  Section *CurrentSection = ErrOrSection.get();
 
   // This is the current list of prefixes for all existing users matching file
   // path. We may need parametrization in constructor in future.
@@ -300,12 +301,13 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
         return false;
       }
 
-      if (auto Err = addSection(Line.drop_front().drop_back(), FileIdx, LineNo,
-                                UseGlobs)
-                         .moveInto(CurrentSection)) {
+      auto ErrOrSection =
+          addSection(Line.drop_front().drop_back(), FileIdx, LineNo, UseGlobs);
+      if (auto Err = ErrOrSection.takeError()) {
         Error = toString(std::move(Err));
         return false;
       }
+      CurrentSection = ErrOrSection.get();
       continue;
     }
 

From 5b8e86909c94ad81b4c2f0cee50c3433ec318788 Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Mon, 10 Nov 2025 14:48:08 -0800
Subject: [PATCH 28/97] [llc] Fix save-stats test in read only directories
 (#167403)

The save stats test would fail downstream due to the tests being run
from a read only directory. The cwd flag would attempt to place the
statistics in that read only directory, causing an error. This patch
ensures that the tests are always run from a temp directory.
---
 llvm/test/tools/llc/save-stats.ll | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/test/tools/llc/save-stats.ll b/llvm/test/tools/llc/save-stats.ll
index 4950625c809cc..a5769f86648dc 100644
--- a/llvm/test/tools/llc/save-stats.ll
+++ b/llvm/test/tools/llc/save-stats.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
 
+; RUN: rm -rf %t.dir && mkdir -p %t.dir && cd %t.dir
 ; RUN: llc --save-stats=obj -o %t.s %s && cat %t.stats | FileCheck %s
 ; RUN: llc --save-stats=cwd -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s
 ; RUN: llc --save-stats -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s

From c5ce8021da09cfe232abe2e869269297b00fbcdb Mon Sep 17 00:00:00 2001
From: Sterling-Augustine <saugustine@google.com>
Date: Mon, 10 Nov 2025 14:58:26 -0800
Subject: [PATCH 29/97] [libc] fwrite_unlocked: only return errno if an actual
 error occurred. (#167350)

fwrite and friends don't modify errno if no error occurred. Therefore
frite_unlocked's return value shouldn't be constructed from errno
without checking if an error actually occurred.

This fixes an error introduced by
https://github.com/llvm/llvm-project/commit/9e2f73fe9052a4fbf382a06e30b2441c6d99fb7e
---
 libc/src/stdio/printf_core/vfprintf_internal.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h
index 564441d3bf51a..c47a03d741f98 100644
--- a/libc/src/stdio/printf_core/vfprintf_internal.h
+++ b/libc/src/stdio/printf_core/vfprintf_internal.h
@@ -51,8 +51,11 @@ LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); }
 LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size,
                                          size_t nmemb, ::FILE *f) {
   // Need to use system errno in this case, as system write will set this errno
-  // which we need to propagate back into our code.
-  return {::fwrite_unlocked(ptr, size, nmemb, f), errno};
+  // which we need to propagate back into our code. fwrite only modifies errno
+  // if there was an error, and errno may have previously been nonzero. Only
+  // return errno if there was an error.
+  size_t members_written = ::fwrite_unlocked(ptr, size, nmemb, f);
+  return {members_written, members_written == nmemb ? 0 : errno};
 }
 #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE
 } // namespace internal

From e5e74e987751c88eeaa453a9a2ef7840f9d44a62 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 15:04:24 -0800
Subject: [PATCH 30/97] AMDGPU: Use getMergedLocation in SILoadStoreOptimizer
 (#156396)

This is merging loads and stores so use the combined DebugLoc.

Not sure if computeBase should be using the merged location from
all the involved instructions. I'm also not sure how to test this
sort of thing.
---
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    | 66 ++++++++------
 .../AMDGPU/ds-read2-write2-debug-info.ll      | 89 +++++++++++++++++++
 2 files changed, 130 insertions(+), 25 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll

diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index f0d1117664983..00aae2c910608 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -233,10 +233,11 @@ class SILoadStoreOptimizer {
 
   void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore,
-                      AMDGPU::OpName OpName, Register DestReg) const;
+                      const DebugLoc &DL, AMDGPU::OpName OpName,
+                      Register DestReg) const;
   Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                            MachineBasicBlock::iterator InsertBefore,
-                           AMDGPU::OpName OpName) const;
+                           const DebugLoc &DL, AMDGPU::OpName OpName) const;
 
   unsigned read2Opcode(unsigned EltSize) const;
   unsigned read2ST64Opcode(unsigned EltSize) const;
@@ -1367,10 +1368,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
 // Paired.
 void SILoadStoreOptimizer::copyToDestRegs(
     CombineInfo &CI, CombineInfo &Paired,
-    MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName,
-    Register DestReg) const {
+    MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL,
+    AMDGPU::OpName OpName, Register DestReg) const {
   MachineBasicBlock *MBB = CI.I->getParent();
-  DebugLoc DL = CI.I->getDebugLoc();
 
   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
 
@@ -1398,9 +1398,9 @@ void SILoadStoreOptimizer::copyToDestRegs(
 Register
 SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                       MachineBasicBlock::iterator InsertBefore,
+                                      const DebugLoc &DL,
                                       AMDGPU::OpName OpName) const {
   MachineBasicBlock *MBB = CI.I->getParent();
-  DebugLoc DL = CI.I->getDebugLoc();
 
   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
 
@@ -1456,7 +1456,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
   Register DestReg = MRI->createVirtualRegister(SuperRC);
 
-  DebugLoc DL = CI.I->getDebugLoc();
+  DebugLoc DL =
+      DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
 
   Register BaseReg = AddrReg->getReg();
   unsigned BaseSubReg = AddrReg->getSubReg();
@@ -1484,7 +1485,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
           .addImm(0)                                 // gds
           .cloneMergedMemRefs({&*CI.I, &*Paired.I});
 
-  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
+  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1541,7 +1542,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
 
   const MCInstrDesc &Write2Desc = TII->get(Opc);
-  DebugLoc DL = CI.I->getDebugLoc();
+  DebugLoc DL =
+      DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
 
   Register BaseReg = AddrReg->getReg();
   unsigned BaseSubReg = AddrReg->getSubReg();
@@ -1582,7 +1584,9 @@ MachineBasicBlock::iterator
 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                      MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
-  DebugLoc DL = CI.I->getDebugLoc();
+  DebugLoc DL =
+      DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
+
   const unsigned Opcode = getNewOpcode(CI, Paired);
 
   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
@@ -1607,7 +1611,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
 
   MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
-  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
+  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1618,7 +1622,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
-  DebugLoc DL = CI.I->getDebugLoc();
+  DebugLoc DL =
+      DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
+
   const unsigned Opcode = getNewOpcode(CI, Paired);
 
   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
@@ -1639,7 +1645,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
   New.addImm(MergedOffset);
   New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
-  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
+  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1650,7 +1656,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
-  DebugLoc DL = CI.I->getDebugLoc();
+
+  DebugLoc DL =
+      DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
 
   const unsigned Opcode = getNewOpcode(CI, Paired);
 
@@ -1680,7 +1688,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
         .addImm(0)            // swz
         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
-  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
+  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1691,7 +1699,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
-  DebugLoc DL = CI.I->getDebugLoc();
+
+  DebugLoc DL =
+      DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
 
   const unsigned Opcode = getNewOpcode(CI, Paired);
 
@@ -1731,7 +1741,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
           .addImm(0)            // swz
           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
-  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
+  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1742,12 +1752,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
-  DebugLoc DL = CI.I->getDebugLoc();
+  DebugLoc DL =
+      DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
 
   const unsigned Opcode = getNewOpcode(CI, Paired);
 
   Register SrcReg =
-      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
+      copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
 
   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                  .addReg(SrcReg, RegState::Kill);
@@ -1789,7 +1800,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
-  DebugLoc DL = CI.I->getDebugLoc();
+
+  DebugLoc DL =
+      DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
 
   const unsigned Opcode = getNewOpcode(CI, Paired);
 
@@ -1807,7 +1820,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
        .addImm(CI.CPol)
        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
-  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
+  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1818,12 +1831,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
-  DebugLoc DL = CI.I->getDebugLoc();
+
+  DebugLoc DL =
+      DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
 
   const unsigned Opcode = getNewOpcode(CI, Paired);
 
   Register SrcReg =
-      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
+      copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
 
   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
@@ -2094,12 +2109,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
-  DebugLoc DL = CI.I->getDebugLoc();
+  DebugLoc DL =
+      DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
 
   const unsigned Opcode = getNewOpcode(CI, Paired);
 
   Register SrcReg =
-      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
+      copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
 
   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                  .addReg(SrcReg, RegState::Kill);
diff --git a/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll b/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll
new file mode 100644
index 0000000000000..0873003899e8a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=debugify < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck %s
+
+@lds = addrspace(3) global [512 x float] poison, align 4
+
+define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
+; CHECK-LABEL: simple_write2_one_val_f32:
+; CHECK:       .Lfunc_begin0:
+; CHECK-NEXT:    .cfi_sections .debug_frame
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    .file 1 "/" "<stdin>"
+; CHECK-NEXT:    .loc 1 1 1 prologue_end ; <stdin>:1:1
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
+; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_write2_one_val_f32:1 <- $vgpr0
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_write2_one_val_f32:5 <- [DW_OP_plus_uconst 8, DW_OP_stack_value] $vgpr0
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_write2_one_val_f32:3 <- undef
+; CHECK-NEXT:    .loc 1 2 1 ; <stdin>:2:1
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:  .Ltmp1:
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_write2_one_val_f32:4 <- $vgpr0
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_write2_one_val_f32:2 <- undef
+; CHECK-NEXT:    .loc 1 3 1 ; <stdin>:3:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_dword v1, v0, s[0:1]
+; CHECK-NEXT:  .Ltmp2:
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_write2_one_val_f32:6 <- [DW_OP_plus_uconst 32, DW_OP_stack_value] $vgpr0
+; CHECK-NEXT:    .loc 1 0 0 is_stmt 0 ; <stdin>:0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write2_b32 v0, v1, v1 offset1:8
+; CHECK-NEXT:    .loc 1 9 1 is_stmt 1 ; <stdin>:9:1
+; CHECK-NEXT:    s_endpgm
+; CHECK-NEXT:  .Ltmp3:
+  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %in.gep = getelementptr float, ptr addrspace(1) %in, i32 %x.i
+  %val = load float, ptr addrspace(1) %in.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  store float %val, ptr addrspace(3) %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  store float %val, ptr addrspace(3) %arrayidx1, align 4
+  ret void
+}
+
+define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 {
+; CHECK-LABEL: simple_read2_f32:
+; CHECK:       .Lfunc_begin1:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  ; %bb.0:
+; CHECK-NEXT:    .loc 1 11 1 prologue_end ; <stdin>:11:1
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CHECK-NEXT:  .Ltmp4:
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_read2_f32:8 <- $vgpr2
+; CHECK-NEXT:    .loc 1 0 0 is_stmt 0 ; <stdin>:0
+; CHECK-NEXT:    ds_read2_b32 v[0:1], v2 offset1:8
+; CHECK-NEXT:  .Ltmp5:
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_read2_f32:9 <- undef
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_read2_f32:11 <- [DW_OP_plus_uconst 32, DW_OP_stack_value] $vgpr2
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_read2_f32:12 <- undef
+; CHECK-NEXT:    .loc 1 10 1 is_stmt 1 ; <stdin>:10:1
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT:  .Ltmp6:
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_read2_f32:7 <- undef
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_read2_f32:10 <- [DW_OP_plus_uconst 8, DW_OP_stack_value] undef
+; CHECK-NEXT:    .loc 1 16 1 ; <stdin>:16:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_add_f32_e32 v0, v0, v1
+; CHECK-NEXT:  .Ltmp7:
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_read2_f32:13 <- $vgpr0
+; CHECK-NEXT:    ;DEBUG_VALUE: simple_read2_f32:14 <- undef
+; CHECK-NEXT:    .loc 1 18 1 ; <stdin>:18:1
+; CHECK-NEXT:    global_store_dword v2, v0, s[0:1]
+; CHECK-NEXT:    .loc 1 19 1 ; <stdin>:19:1
+; CHECK-NEXT:    s_endpgm
+; CHECK-NEXT:  .Ltmp8:
+  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
+  store float %sum, ptr addrspace(1) %out.gep, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }

From 46dc4b841daad98c8358ac873827b6de7ce104c5 Mon Sep 17 00:00:00 2001
From: GeorgeHuyubo <113479859+GeorgeHuyubo@users.noreply.github.com>
Date: Mon, 10 Nov 2025 15:21:06 -0800
Subject: [PATCH 31/97] [lldb] Fix TestLocationsAfterRebuild test (#167402)

After commit fce58897ce82 enabled the locate_module callback for main
executables, the TestLocationsAfterRebuild.py test started failing on
remote platforms. The test was hardcoded to expect breakpoint location
"1.3" to exist after three rebuilds, but the new module loading behavior
changed how breakpoint locations are managed.

This patch fixes the test by:
- Removing the @skipIfRemote decorator to re-enable testing on remote
platforms
- Dynamically querying the actual number of breakpoint locations instead
of assuming a specific count
- Using loc.GetID() to get actual location IDs rather than assuming
sequential IDs (1, 2, 3)
- Iterating through all valid locations and verifying each can be
disabled/enabled

The fix maintains the original test intent (validating that breakpoint
location IDs remain valid after rebuilds) while adapting to the new
module loading behavior where the number and IDs of locations may vary
across platforms.

Co-authored-by: George Hu <georgehuyubo@gmail.com>
---
 .../TestLocationsAfterRebuild.py              | 25 ++++++++++++++++---
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py
index e772965f5bb85..bc53feaafa635 100644
--- a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py
+++ b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py
@@ -7,7 +7,7 @@
 import lldb
 import lldbsuite.test.lldbutil as lldbutil
 from lldbsuite.test.lldbtest import *
-from lldbsuite.test.decorators import skipIfWindows, skipIfRemote
+from lldbsuite.test.decorators import skipIfWindows
 import os
 
 
@@ -19,7 +19,6 @@ class TestLocationsAfterRebuild(TestBase):
 
     # On Windows we cannot remove a file that lldb is debugging.
     @skipIfWindows
-    @skipIfRemote
     def test_remaining_location_spec(self):
         """If we rebuild a couple of times some of the old locations
         get removed.  Make sure the command-line breakpoint id
@@ -55,6 +54,24 @@ def test_remaining_location_spec(self):
             self, target, bkpt
         )
 
+        # After enabling locate_module callback for main executables,
+        # the number of locations may vary depending on the platform.
+        num_locs = bkpt.GetNumLocations()
         bkpt_id = bkpt.GetID()
-        loc_string = f"{bkpt_id}.3"
-        self.runCmd(f"break disable {loc_string}")
+
+        self.assertGreater(
+            num_locs,
+            0,
+            f"Expected at least one breakpoint location, but found {num_locs}",
+        )
+
+        # Iterate through all valid locations and verify we can disable each one.
+        # This tests that breakpoint location IDs remain valid after rebuilds.
+        for loc_idx in range(num_locs):
+            loc = bkpt.GetLocationAtIndex(loc_idx)
+            self.assertTrue(loc.IsValid(), f"Location at index {loc_idx} is not valid")
+
+            # Get the actual location ID from the location object
+            loc_id = loc.GetID()
+            loc_string = f"{bkpt_id}.{loc_id}"
+            self.runCmd(f"break disable {loc_string}")

From 97d50b5c8a7b9269953888d2fcf21e4d4a16dc25 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Mon, 10 Nov 2025 15:35:11 -0800
Subject: [PATCH 32/97] [lldb] Fix crash in BreakpointSite::BumpHitCounts
 (#166876)

Fix crash in `BreakpointSite::BumpHitCounts` due to missing
synchronization. When bumping the hit count, we were correctly acquiring
the constituents mutex, but didn't protect the breakpoint location
collection. This PR fixes the issue by making the iterator returned by
the collection a `LockingAdaptedIterable`.

rdar://163760832
---
 .../Breakpoint/BreakpointLocationCollection.h | 10 ++++++----
 .../BreakpointLocationCollection.cpp          | 20 ++++++++++---------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h
index 124cb55eaf723..57acb82dd96e9 100644
--- a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h
+++ b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h
@@ -32,7 +32,8 @@ class BreakpointLocationCollection {
 
   ~BreakpointLocationCollection();
 
-  BreakpointLocationCollection &operator=(const BreakpointLocationCollection &rhs);
+  BreakpointLocationCollection &
+  operator=(const BreakpointLocationCollection &rhs);
 
   /// Add the breakpoint \a bp_loc_sp to the list.
   ///
@@ -172,17 +173,18 @@ class BreakpointLocationCollection {
                          lldb::break_id_t break_loc_id) const;
 
   collection m_break_loc_collection;
-  mutable std::mutex m_collection_mutex;
+  mutable std::recursive_mutex m_collection_mutex;
   /// These are used if we're preserving breakpoints in this list:
   const bool m_preserving_bkpts = false;
   std::map<std::pair<lldb::break_id_t, lldb::break_id_t>, lldb::BreakpointSP>
       m_preserved_bps;
 
 public:
-  typedef llvm::iterator_range<collection::const_iterator>
+  typedef LockingAdaptedIterable<std::recursive_mutex, collection>
       BreakpointLocationCollectionIterable;
   BreakpointLocationCollectionIterable BreakpointLocations() {
-    return BreakpointLocationCollectionIterable(m_break_loc_collection);
+    return BreakpointLocationCollectionIterable(m_break_loc_collection,
+                                                m_collection_mutex);
   }
 };
 } // namespace lldb_private
diff --git a/lldb/source/Breakpoint/BreakpointLocationCollection.cpp b/lldb/source/Breakpoint/BreakpointLocationCollection.cpp
index 97715836ec104..adff4299a5289 100644
--- a/lldb/source/Breakpoint/BreakpointLocationCollection.cpp
+++ b/lldb/source/Breakpoint/BreakpointLocationCollection.cpp
@@ -24,7 +24,7 @@ BreakpointLocationCollection::BreakpointLocationCollection(bool preserving)
 BreakpointLocationCollection::~BreakpointLocationCollection() = default;
 
 void BreakpointLocationCollection::Add(const BreakpointLocationSP &bp_loc) {
-  std::lock_guard<std::mutex> guard(m_collection_mutex);
+  std::lock_guard<std::recursive_mutex> guard(m_collection_mutex);
   BreakpointLocationSP old_bp_loc =
       FindByIDPair(bp_loc->GetBreakpoint().GetID(), bp_loc->GetID());
   if (!old_bp_loc.get()) {
@@ -44,7 +44,7 @@ void BreakpointLocationCollection::Add(const BreakpointLocationSP &bp_loc) {
 
 bool BreakpointLocationCollection::Remove(lldb::break_id_t bp_id,
                                           lldb::break_id_t bp_loc_id) {
-  std::lock_guard<std::mutex> guard(m_collection_mutex);
+  std::lock_guard<std::recursive_mutex> guard(m_collection_mutex);
   collection::iterator pos = GetIDPairIterator(bp_id, bp_loc_id); // Predicate
   if (pos != m_break_loc_collection.end()) {
     if (m_preserving_bkpts) {
@@ -117,7 +117,7 @@ const BreakpointLocationSP BreakpointLocationCollection::FindByIDPair(
 }
 
 BreakpointLocationSP BreakpointLocationCollection::GetByIndex(size_t i) {
-  std::lock_guard<std::mutex> guard(m_collection_mutex);
+  std::lock_guard<std::recursive_mutex> guard(m_collection_mutex);
   BreakpointLocationSP stop_sp;
   if (i < m_break_loc_collection.size())
     stop_sp = m_break_loc_collection[i];
@@ -127,7 +127,7 @@ BreakpointLocationSP BreakpointLocationCollection::GetByIndex(size_t i) {
 
 const BreakpointLocationSP
 BreakpointLocationCollection::GetByIndex(size_t i) const {
-  std::lock_guard<std::mutex> guard(m_collection_mutex);
+  std::lock_guard<std::recursive_mutex> guard(m_collection_mutex);
   BreakpointLocationSP stop_sp;
   if (i < m_break_loc_collection.size())
     stop_sp = m_break_loc_collection[i];
@@ -168,7 +168,7 @@ bool BreakpointLocationCollection::ShouldStop(
 }
 
 bool BreakpointLocationCollection::ValidForThisThread(Thread &thread) {
-  std::lock_guard<std::mutex> guard(m_collection_mutex);
+  std::lock_guard<std::recursive_mutex> guard(m_collection_mutex);
   collection::iterator pos, begin = m_break_loc_collection.begin(),
                             end = m_break_loc_collection.end();
 
@@ -180,7 +180,7 @@ bool BreakpointLocationCollection::ValidForThisThread(Thread &thread) {
 }
 
 bool BreakpointLocationCollection::IsInternal() const {
-  std::lock_guard<std::mutex> guard(m_collection_mutex);
+  std::lock_guard<std::recursive_mutex> guard(m_collection_mutex);
   collection::const_iterator pos, begin = m_break_loc_collection.begin(),
                                   end = m_break_loc_collection.end();
 
@@ -197,7 +197,7 @@ bool BreakpointLocationCollection::IsInternal() const {
 
 void BreakpointLocationCollection::GetDescription(
     Stream *s, lldb::DescriptionLevel level) {
-  std::lock_guard<std::mutex> guard(m_collection_mutex);
+  std::lock_guard<std::recursive_mutex> guard(m_collection_mutex);
   collection::iterator pos, begin = m_break_loc_collection.begin(),
                             end = m_break_loc_collection.end();
 
@@ -212,8 +212,10 @@ BreakpointLocationCollection &BreakpointLocationCollection::operator=(
     const BreakpointLocationCollection &rhs) {
   if (this != &rhs) {
       std::lock(m_collection_mutex, rhs.m_collection_mutex);
-      std::lock_guard<std::mutex> lhs_guard(m_collection_mutex, std::adopt_lock);
-      std::lock_guard<std::mutex> rhs_guard(rhs.m_collection_mutex, std::adopt_lock);
+      std::lock_guard<std::recursive_mutex> lhs_guard(m_collection_mutex,
+                                                      std::adopt_lock);
+      std::lock_guard<std::recursive_mutex> rhs_guard(rhs.m_collection_mutex,
+                                                      std::adopt_lock);
       m_break_loc_collection = rhs.m_break_loc_collection;
   }
   return *this;

From 6537e0dba793e971181bb413e61c92e93475daf9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 15:37:41 -0800
Subject: [PATCH 33/97] ARM: Remove TRI argument from AddDReg (#158228)

---
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 86 ++++++++++++------------
 llvm/lib/Target/ARM/ARMBaseInstrInfo.h   |  3 +-
 llvm/lib/Target/ARM/Thumb2InstrInfo.cpp  |  8 +--
 3 files changed, 48 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index b466ca6f19b3e..32f696c7c9b56 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -929,15 +929,15 @@ ARMBaseInstrInfo::describeLoadedValue(const MachineInstr &MI,
   return TargetInstrInfo::describeLoadedValue(MI, Reg);
 }
 
-const MachineInstrBuilder &
-ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
-                          unsigned SubIdx, unsigned State,
-                          const TargetRegisterInfo *TRI) const {
+const MachineInstrBuilder &ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB,
+                                                     unsigned Reg,
+                                                     unsigned SubIdx,
+                                                     unsigned State) const {
   if (!SubIdx)
     return MIB.addReg(Reg, State);
 
   if (Register::isPhysicalRegister(Reg))
-    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+    return MIB.addReg(getRegisterInfo().getSubReg(Reg, SubIdx), State);
   return MIB.addReg(Reg, State, SubIdx);
 }
 
@@ -1011,8 +1011,8 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
       } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
         if (Subtarget.hasV5TEOps()) {
           MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::STRD));
-          AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
-          AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+          AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill));
+          AddDReg(MIB, SrcReg, ARM::gsub_1, 0);
           MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)
              .add(predOps(ARMCC::AL));
         } else {
@@ -1022,8 +1022,8 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                         .addFrameIndex(FI)
                                         .addMemOperand(MMO)
                                         .add(predOps(ARMCC::AL));
-          AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
-          AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+          AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill));
+          AddDReg(MIB, SrcReg, ARM::gsub_1, 0);
         }
       } else
         llvm_unreachable("Unknown reg class!");
@@ -1073,9 +1073,9 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                         .addFrameIndex(FI)
                                         .add(predOps(ARMCC::AL))
                                         .addMemOperand(MMO);
-          MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
-          MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
-          AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill));
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0);
+          AddDReg(MIB, SrcReg, ARM::dsub_2, 0);
         }
       } else
         llvm_unreachable("Unknown reg class!");
@@ -1105,10 +1105,10 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                         .addFrameIndex(FI)
                                         .add(predOps(ARMCC::AL))
                                         .addMemOperand(MMO);
-          MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
-          MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
-          MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
-                AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill));
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0);
+                AddDReg(MIB, SrcReg, ARM::dsub_3, 0);
         }
       } else
         llvm_unreachable("Unknown reg class!");
@@ -1125,14 +1125,14 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                       .addFrameIndex(FI)
                                       .add(predOps(ARMCC::AL))
                                       .addMemOperand(MMO);
-        MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
-        MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
-        MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
-        MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
-        MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI);
-        MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI);
-        MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI);
-              AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill));
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0);
+              AddDReg(MIB, SrcReg, ARM::dsub_7, 0);
       } else
         llvm_unreachable("Unknown reg class!");
       break;
@@ -1272,8 +1272,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
 
       if (Subtarget.hasV5TEOps()) {
         MIB = BuildMI(MBB, I, DL, get(ARM::LDRD));
-        AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
-        AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+        AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead);
+        AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead);
         MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)
            .add(predOps(ARMCC::AL));
       } else {
@@ -1283,8 +1283,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
                   .addFrameIndex(FI)
                   .addMemOperand(MMO)
                   .add(predOps(ARMCC::AL));
-        MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
-        MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead);
+        MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead);
       }
 
       if (DestReg.isPhysical())
@@ -1330,9 +1330,9 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
                                       .addFrameIndex(FI)
                                       .addMemOperand(MMO)
                                       .add(predOps(ARMCC::AL));
-        MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
-        MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
-        MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead);
         if (DestReg.isPhysical())
           MIB.addReg(DestReg, RegState::ImplicitDefine);
       }
@@ -1359,10 +1359,10 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
                                        .addFrameIndex(FI)
                                        .add(predOps(ARMCC::AL))
                                        .addMemOperand(MMO);
-         MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
-         MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
-         MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
-         MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+         MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead);
+         MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead);
+         MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead);
+         MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead);
          if (DestReg.isPhysical())
            MIB.addReg(DestReg, RegState::ImplicitDefine);
        }
@@ -1380,14 +1380,14 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
                                     .addFrameIndex(FI)
                                     .add(predOps(ARMCC::AL))
                                     .addMemOperand(MMO);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI);
-      MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead);
       if (DestReg.isPhysical())
         MIB.addReg(DestReg, RegState::ImplicitDefine);
     } else
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 27f8e3b3170e7..c817e94dd9c76 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -240,8 +240,7 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
             const MachineInstr &Orig) const override;
 
   const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
-                                     unsigned SubIdx, unsigned State,
-                                     const TargetRegisterInfo *TRI) const;
+                                     unsigned SubIdx, unsigned State) const;
 
   bool produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1,
                         const MachineRegisterInfo *MRI) const override;
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index b66e4075d4cfa..6e7d8f75b1aef 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -197,8 +197,8 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     }
 
     MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8));
-    AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
-    AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+    AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill));
+    AddDReg(MIB, SrcReg, ARM::gsub_1, 0);
     MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL));
     return;
   }
@@ -238,8 +238,8 @@ void Thumb2InstrInfo::loadRegFromStackSlot(
     }
 
     MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8));
-    AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
-    AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+    AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead);
+    AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead);
     MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL));
 
     if (DestReg.isPhysical())

From 55422e804b3bd2fcb1a330673af40240e359540f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 15:43:55 -0800
Subject: [PATCH 34/97] CodeGen: Remove TRI argument from getRegClass (#158225)

TargetInstrInfo now directly holds a reference to TargetRegisterInfo
and does not need TRI passed in anywhere.
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |  5 ++--
 llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp |  4 +--
 llvm/lib/CodeGen/BreakFalseDeps.cpp           |  2 +-
 llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp   |  4 +--
 llvm/lib/CodeGen/GlobalISel/Utils.cpp         |  2 +-
 llvm/lib/CodeGen/InitUndef.cpp                |  2 +-
 llvm/lib/CodeGen/MachineInstr.cpp             |  2 +-
 llvm/lib/CodeGen/MachineLICM.cpp              |  2 +-
 llvm/lib/CodeGen/MachineVerifier.cpp          | 11 +++----
 llvm/lib/CodeGen/RegisterCoalescer.cpp        |  2 +-
 llvm/lib/CodeGen/SelectionDAG/FastISel.cpp    |  2 +-
 .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp |  9 +++---
 .../SelectionDAG/ScheduleDAGRRList.cpp        |  2 +-
 llvm/lib/CodeGen/TargetInstrInfo.cpp          |  5 ++--
 .../lib/CodeGen/TwoAddressInstructionPass.cpp |  2 +-
 .../AArch64/AArch64ConditionalCompares.cpp    | 10 +++----
 .../AArch64DeadRegisterDefinitionsPass.cpp    |  2 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  5 +---
 .../Target/AArch64/AArch64MIPeepholeOpt.cpp   | 12 ++++----
 .../Target/AArch64/AArch64RegisterInfo.cpp    |  2 +-
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     |  4 +--
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 11 ++++---
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  5 ++--
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    |  6 ++--
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp     |  2 +-
 llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp   |  5 ++--
 llvm/lib/Target/ARM/ARMFrameLowering.cpp      |  3 +-
 llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp |  8 ++---
 llvm/lib/Target/ARM/MLxExpansionPass.cpp      |  2 +-
 llvm/lib/Target/ARM/Thumb2InstrInfo.cpp       |  2 +-
 .../lib/Target/Hexagon/HexagonBitSimplify.cpp |  2 +-
 .../Target/Hexagon/HexagonFrameLowering.cpp   |  4 +--
 .../Hexagon/HexagonLoadStoreWidening.cpp      |  4 +--
 .../Target/Hexagon/HexagonVLIWPacketizer.cpp  |  4 +--
 .../LoongArchDeadRegisterDefinitions.cpp      |  3 +-
 llvm/lib/Target/Mips/MipsSEInstrInfo.cpp      |  4 +--
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp   |  4 +--
 .../RISCV/RISCVDeadRegisterDefinitions.cpp    |  3 +-
 llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp |  4 +--
 .../SystemZ/SystemZHazardRecognizer.cpp       |  3 +-
 .../X86/X86AvoidStoreForwardingBlocks.cpp     |  6 ++--
 llvm/lib/Target/X86/X86DomainReassignment.cpp |  4 +--
 llvm/lib/Target/X86/X86InstrInfo.cpp          | 30 +++++++++----------
 llvm/lib/Target/X86/X86InstrInfo.h            |  5 ++--
 llvm/lib/Target/X86/X86OptimizeLEAs.cpp       |  2 +-
 .../X86/X86SpeculativeLoadHardening.cpp       |  2 +-
 46 files changed, 98 insertions(+), 121 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 02cd9256c422a..2a0b1a6b183bc 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -159,9 +159,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
 
   /// Given a machine instruction descriptor, returns the register
   /// class constraint for OpNum, or NULL.
-  virtual const TargetRegisterClass *
-  getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
-              const TargetRegisterInfo *TRI) const;
+  virtual const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID,
+                                                 unsigned OpNum) const;
 
   /// Returns true if MI is an instruction we are unable to reason about
   /// (like a call or something with unmodeled side effects).
diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 6567bd403c857..46b5bb7908227 100644
--- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -395,7 +395,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(
     // Note register reference...
     const TargetRegisterClass *RC = nullptr;
     if (i < MI.getDesc().getNumOperands())
-      RC = TII->getRegClass(MI.getDesc(), i, TRI);
+      RC = TII->getRegClass(MI.getDesc(), i);
     AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
     RegRefs.emplace(Reg.asMCReg(), RR);
   }
@@ -479,7 +479,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
     // Note register reference...
     const TargetRegisterClass *RC = nullptr;
     if (i < MI.getDesc().getNumOperands())
-      RC = TII->getRegClass(MI.getDesc(), i, TRI);
+      RC = TII->getRegClass(MI.getDesc(), i);
     AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
     RegRefs.emplace(Reg.asMCReg(), RR);
   }
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp
index 1846880b0c181..fead3ee250841 100644
--- a/llvm/lib/CodeGen/BreakFalseDeps.cpp
+++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -133,7 +133,7 @@ bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
   }
 
   // Get the undef operand's register class
-  const TargetRegisterClass *OpRC = TII->getRegClass(MI->getDesc(), OpIdx, TRI);
+  const TargetRegisterClass *OpRC = TII->getRegClass(MI->getDesc(), OpIdx);
   assert(OpRC && "Not a valid register class");
 
   // If the instruction has a true dependency, we can hide the false depdency
diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 86377cff2d29d..3259a3e83c541 100644
--- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -187,7 +187,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) {
     const TargetRegisterClass *NewRC = nullptr;
 
     if (i < MI.getDesc().getNumOperands())
-      NewRC = TII->getRegClass(MI.getDesc(), i, TRI);
+      NewRC = TII->getRegClass(MI.getDesc(), i);
 
     // For now, only allow the register to be changed if its register
     // class is consistent across all uses.
@@ -316,7 +316,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) {
 
     const TargetRegisterClass *NewRC = nullptr;
     if (i < MI.getDesc().getNumOperands())
-      NewRC = TII->getRegClass(MI.getDesc(), i, TRI);
+      NewRC = TII->getRegClass(MI.getDesc(), i);
 
     // For now, only allow the register to be changed if its register
     // class is consistent across all uses.
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 5fab6ec506e94..e8954a3d9899b 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -114,7 +114,7 @@ Register llvm::constrainOperandRegClass(
   // Assume physical registers are properly constrained.
   assert(Reg.isVirtual() && "PhysReg not implemented");
 
-  const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx, &TRI);
+  const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx);
   // Some of the target independent instructions, like COPY, may not impose any
   // register class constraints on some of their operands: If it's a use, we can
   // skip constraining as the instruction defining the register would constrain
diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp
index e07e598019709..12b36f56d4d9a 100644
--- a/llvm/lib/CodeGen/InitUndef.cpp
+++ b/llvm/lib/CodeGen/InitUndef.cpp
@@ -232,7 +232,7 @@ bool InitUndef::processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB,
       MachineOperand &UseMO = MI.getOperand(UseOpIdx);
       if (UseMO.getReg() == MCRegister::NoRegister) {
         const TargetRegisterClass *RC =
-            TII->getRegClass(MI.getDesc(), UseOpIdx, TRI);
+            TII->getRegClass(MI.getDesc(), UseOpIdx);
         Register NewDest = MRI->createVirtualRegister(RC);
         // We don't have a way to update dead lanes, so keep track of the
         // new register so that we avoid querying it later.
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 37e5c517d24d8..eb46124d9eb5f 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -978,7 +978,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx,
   assert(getMF() && "Can't have an MF reference here!");
   // Most opcodes have fixed constraints in their MCInstrDesc.
   if (!isInlineAsm())
-    return TII->getRegClass(getDesc(), OpIdx, TRI);
+    return TII->getRegClass(getDesc(), OpIdx);
 
   if (!getOperand(OpIdx).isReg())
     return nullptr;
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 729e73c8c312c..c169467384f8b 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1399,7 +1399,7 @@ MachineInstr *MachineLICMImpl::ExtractHoistableLoad(MachineInstr *MI,
   if (NewOpc == 0) return nullptr;
   const MCInstrDesc &MID = TII->get(NewOpc);
   MachineFunction &MF = *MI->getMF();
-  const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI);
+  const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex);
   // Ok, we're unfolding. Create a temporary register and do the unfold.
   Register Reg = MRI->createVirtualRegister(RC);
 
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index fdf10480b6e05..013f52938b65c 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -2657,8 +2657,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
         return;
       }
       if (MONum < MCID.getNumOperands()) {
-        if (const TargetRegisterClass *DRC =
-                TII->getRegClass(MCID, MONum, TRI)) {
+        if (const TargetRegisterClass *DRC = TII->getRegClass(MCID, MONum)) {
           if (!DRC->contains(Reg)) {
             report("Illegal physical register for instruction", MO, MONum);
             OS << printReg(Reg, TRI) << " is not a "
@@ -2742,12 +2741,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
         // has register class constraint, the virtual register must
         // comply to it.
         if (!isPreISelGenericOpcode(MCID.getOpcode()) &&
-            MONum < MCID.getNumOperands() &&
-            TII->getRegClass(MCID, MONum, TRI)) {
+            MONum < MCID.getNumOperands() && TII->getRegClass(MCID, MONum)) {
           report("Virtual register does not match instruction constraint", MO,
                  MONum);
           OS << "Expect register class "
-             << TRI->getRegClassName(TII->getRegClass(MCID, MONum, TRI))
+             << TRI->getRegClassName(TII->getRegClass(MCID, MONum))
              << " but got nothing\n";
           return;
         }
@@ -2773,8 +2771,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
         }
       }
       if (MONum < MCID.getNumOperands()) {
-        if (const TargetRegisterClass *DRC =
-                TII->getRegClass(MCID, MONum, TRI)) {
+        if (const TargetRegisterClass *DRC = TII->getRegClass(MCID, MONum)) {
           if (SubIdx) {
             const TargetRegisterClass *SuperRC =
                 TRI->getLargestLegalSuperClass(RC, *MF);
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index f93a7f22c3961..005e44fc7080b 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1374,7 +1374,7 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP,
   }
 
   const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg();
-  const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI);
+  const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0);
   if (!DefMI->isImplicitDef()) {
     if (DstReg.isPhysical()) {
       Register NewDstReg = DstReg;
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 507b2d61a534c..5c84059da273b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1965,7 +1965,7 @@ Register FastISel::createResultReg(const TargetRegisterClass *RC) {
 Register FastISel::constrainOperandRegClass(const MCInstrDesc &II, Register Op,
                                             unsigned OpNum) {
   if (Op.isVirtual()) {
-    const TargetRegisterClass *RegClass = TII.getRegClass(II, OpNum, &TRI);
+    const TargetRegisterClass *RegClass = TII.getRegClass(II, OpNum);
     if (!MRI.constrainRegClass(Op, RegClass)) {
       // If it's not legal to COPY between the register classes, something
       // has gone very wrong before we got here.
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index d84c3fb05bb24..72d0c44889048 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -125,7 +125,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg,
           const TargetRegisterClass *RC = nullptr;
           if (i + II.getNumDefs() < II.getNumOperands()) {
             RC = TRI->getAllocatableClass(
-                TII->getRegClass(II, i + II.getNumDefs(), TRI));
+                TII->getRegClass(II, i + II.getNumDefs()));
           }
           if (!UseRC)
             UseRC = RC;
@@ -197,7 +197,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
     // register instead of creating a new vreg.
     Register VRBase;
     const TargetRegisterClass *RC =
-        TRI->getAllocatableClass(TII->getRegClass(II, i, TRI));
+        TRI->getAllocatableClass(TII->getRegClass(II, i));
     // Always let the value type influence the used register class. The
     // constraints on the instruction may be too lax to represent the value
     // type correctly. For example, a 64-bit float (X86::FR64) can't live in
@@ -330,7 +330,7 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB,
   if (II) {
     const TargetRegisterClass *OpRC = nullptr;
     if (IIOpNum < II->getNumOperands())
-      OpRC = TII->getRegClass(*II, IIOpNum, TRI);
+      OpRC = TII->getRegClass(*II, IIOpNum);
 
     if (OpRC) {
       unsigned MinNumRegs = MinRCSize;
@@ -409,8 +409,7 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB, SDValue Op,
     Register VReg = R->getReg();
     MVT OpVT = Op.getSimpleValueType();
     const TargetRegisterClass *IIRC =
-        II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI))
-           : nullptr;
+        II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum)) : nullptr;
     const TargetRegisterClass *OpRC =
         TLI->isTypeLegal(OpVT)
             ? TLI->getRegClassFor(OpVT,
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index f70b6cddcc099..12fc26d949581 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -340,7 +340,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos,
 
     unsigned Idx = RegDefPos.GetIdx();
     const MCInstrDesc &Desc = TII->get(Opcode);
-    const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI);
+    const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx);
     assert(RC && "Not a valid register class");
     RegClass = RC->getID();
     // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 7c89e51ff86d3..6c09d6c650456 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -58,9 +58,8 @@ static cl::opt<unsigned int> MaxAccumulatorWidth(
 
 TargetInstrInfo::~TargetInstrInfo() = default;
 
-const TargetRegisterClass *
-TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
-                             const TargetRegisterInfo * /*RemoveMe*/) const {
+const TargetRegisterClass *TargetInstrInfo::getRegClass(const MCInstrDesc &MCID,
+                                                        unsigned OpNum) const {
   if (OpNum >= MCID.getNumOperands())
     return nullptr;
 
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index b99e1c7f19b71..3f2961cd83bab 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1402,7 +1402,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
         // Unfold the load.
         LLVM_DEBUG(dbgs() << "2addr:   UNFOLDING: " << MI);
         const TargetRegisterClass *RC = TRI->getAllocatableClass(
-            TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI));
+            TII->getRegClass(UnfoldMCID, LoadRegIndex));
         Register Reg = MRI->createVirtualRegister(RC);
         SmallVector<MachineInstr *, 2> NewMIs;
         if (!TII->unfoldMemoryOperand(*MF, MI, Reg,
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index cb831963759b5..7712d2a1d88d8 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -629,8 +629,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
     }
     const MCInstrDesc &MCID = TII->get(Opc);
     // Create a dummy virtual register for the SUBS def.
-    Register DestReg =
-        MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI));
+    Register DestReg = MRI->createVirtualRegister(TII->getRegClass(MCID, 0));
     // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
     BuildMI(*Head, Head->end(), TermDL, MCID)
         .addReg(DestReg, RegState::Define | RegState::Dead)
@@ -638,8 +637,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
         .addImm(0)
         .addImm(0);
     // SUBS uses the GPR*sp register classes.
-    MRI->constrainRegClass(HeadCond[2].getReg(),
-                           TII->getRegClass(MCID, 1, TRI));
+    MRI->constrainRegClass(HeadCond[2].getReg(), TII->getRegClass(MCID, 1));
   }
 
   Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end());
@@ -686,10 +684,10 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC);
   const MCInstrDesc &MCID = TII->get(Opc);
   MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(),
-                         TII->getRegClass(MCID, 0, TRI));
+                         TII->getRegClass(MCID, 0));
   if (CmpMI->getOperand(FirstOp + 1).isReg())
     MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
-                           TII->getRegClass(MCID, 1, TRI));
+                           TII->getRegClass(MCID, 1));
   MachineInstrBuilder MIB = BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
                                 .add(CmpMI->getOperand(FirstOp)); // Register Rn
   if (isZBranch)
diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 75361f5d313c6..4ff49a627c794 100644
--- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -156,7 +156,7 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
         LLVM_DEBUG(dbgs() << "    Ignoring, def is tied operand.\n");
         continue;
       }
-      const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI);
+      const TargetRegisterClass *RC = TII->getRegClass(Desc, I);
       unsigned NewReg;
       if (RC == nullptr) {
         LLVM_DEBUG(dbgs() << "    Ignoring, register is not a GPR.\n");
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 66e49491b18e0..f72a6f0140b48 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -11063,8 +11063,6 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
                            MachineBasicBlock::iterator InsertTo) {
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
-  const TargetRegisterInfo *TRI =
-      MBB.getParent()->getSubtarget().getRegisterInfo();
   MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
   Register Result = 0;
   for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
@@ -11073,8 +11071,7 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
           MRI.getRegClass(NewMI->getOperand(0).getReg()));
       NewMI->getOperand(I).setReg(Result);
     } else if (I == ReplaceOprNum) {
-      MRI.constrainRegClass(ReplaceReg,
-                            TII->getRegClass(NewMI->getDesc(), I, TRI));
+      MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
       NewMI->getOperand(I).setReg(ReplaceReg);
     }
   }
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 04e76c7abd202..d25db89cca358 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -595,17 +595,17 @@ bool AArch64MIPeepholeOpt::splitTwoPartImm(
 
   // Determine register classes for destinations and register operands
   const TargetRegisterClass *FirstInstrDstRC =
-      TII->getRegClass(TII->get(Opcode.first), 0, TRI);
+      TII->getRegClass(TII->get(Opcode.first), 0);
   const TargetRegisterClass *FirstInstrOperandRC =
-      TII->getRegClass(TII->get(Opcode.first), 1, TRI);
+      TII->getRegClass(TII->get(Opcode.first), 1);
   const TargetRegisterClass *SecondInstrDstRC =
       (Opcode.first == Opcode.second)
           ? FirstInstrDstRC
-          : TII->getRegClass(TII->get(Opcode.second), 0, TRI);
+          : TII->getRegClass(TII->get(Opcode.second), 0);
   const TargetRegisterClass *SecondInstrOperandRC =
       (Opcode.first == Opcode.second)
           ? FirstInstrOperandRC
-          : TII->getRegClass(TII->get(Opcode.second), 1, TRI);
+          : TII->getRegClass(TII->get(Opcode.second), 1);
 
   // Get old registers destinations and new register destinations
   Register DstReg = MI.getOperand(0).getReg();
@@ -784,14 +784,14 @@ bool AArch64MIPeepholeOpt::visitUBFMXri(MachineInstr &MI) {
   }
 
   const TargetRegisterClass *DstRC64 =
-      TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI);
+      TII->getRegClass(TII->get(MI.getOpcode()), 0);
   const TargetRegisterClass *DstRC32 =
       TRI->getSubRegisterClass(DstRC64, AArch64::sub_32);
   assert(DstRC32 && "Destination register class of UBFMXri doesn't have a "
                     "sub_32 subregister class");
 
   const TargetRegisterClass *SrcRC64 =
-      TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI);
+      TII->getRegClass(TII->get(MI.getOpcode()), 1);
   const TargetRegisterClass *SrcRC32 =
       TRI->getSubRegisterClass(SrcRC64, AArch64::sub_32);
   assert(SrcRC32 && "Source register class of UBFMXri doesn't have a sub_32 "
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index eaf8723094797..f3cf222038072 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -897,7 +897,7 @@ AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
   const MCInstrDesc &MCID = TII->get(AArch64::ADDXri);
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   Register BaseReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
-  MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this));
+  MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0));
   unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
 
   BuildMI(*MBB, Ins, DL, MCID, BaseReg)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 964309b2e4b1e..293005c759e53 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -713,7 +713,7 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
 
   // Verify the register is compatible with the operand.
   if (const TargetRegisterClass *OpRC =
-          TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
+          TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) {
     const TargetRegisterClass *NewRC =
         TRI->getRegClassForReg(*MRI, New->getReg());
 
@@ -2394,7 +2394,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
 
   unsigned OpIdx = Op - &UseMI->getOperand(0);
   const MCInstrDesc &InstDesc = UseMI->getDesc();
-  const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI);
+  const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx);
   if (!OpRC || !TRI->isVectorSuperClass(OpRC))
     return false;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6b397e0834686..89f490356aa26 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2570,7 +2570,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
 
     const MCInstrDesc &TID = get(NewOpcode);
     const TargetRegisterClass *NewRC =
-        RI.getAllocatableClass(getRegClass(TID, 0, &RI));
+        RI.getAllocatableClass(getRegClass(TID, 0));
     MRI.setRegClass(DestReg, NewRC);
 
     UseMO->setReg(DestReg);
@@ -3613,7 +3613,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
           AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
       const MCInstrDesc &MovDesc = get(MovOp);
 
-      const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI);
+      const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
       if (Is16Bit) {
         // We just need to find a correctly sized register class, so the
         // subregister index compatibility doesn't matter since we're statically
@@ -6028,9 +6028,8 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
 // FIXME: This should not be an overridable function. All subtarget dependent
 // operand modifications should go through isLookupRegClassByHwMode in the
 // generic handling.
-const TargetRegisterClass *
-SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum,
-                         const TargetRegisterInfo *TRI) const {
+const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
+                                                    unsigned OpNum) const {
   if (OpNum >= TID.getNumOperands())
     return nullptr;
   const MCOperandInfo &OpInfo = TID.operands()[OpNum];
@@ -6805,7 +6804,7 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
     return;
 
   const TargetRegisterClass *DeclaredRC =
-      getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI);
+      getRegClass(MI.getDesc(), SAddr->getOperandNo());
 
   Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
   SAddr->setReg(ToSGPR);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 8d693b1b19dcd..6966e45027627 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1622,9 +1622,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   /// Return true if this opcode should not be used by codegen.
   bool isAsmOnlyOpcode(int MCOp) const;
 
-  const TargetRegisterClass *
-  getRegClass(const MCInstrDesc &TID, unsigned OpNum,
-              const TargetRegisterInfo *TRI) const override;
+  const TargetRegisterClass *getRegClass(const MCInstrDesc &TID,
+                                         unsigned OpNum) const override;
 
   void fixImplicitOperands(MachineInstr &MI) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 00aae2c910608..fcf91e0cf0a7c 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1337,11 +1337,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
     int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
                                               AMDGPU::OpName::data1);
 
-    const TargetRegisterClass *DataRC0 =
-        TII->getRegClass(Write2Opc, Data0Idx, TRI);
+    const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx);
 
-    const TargetRegisterClass *DataRC1 =
-        TII->getRegClass(Write2Opc, Data1Idx, TRI);
+    const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx);
 
     if (unsigned SubReg = Data0->getSubReg()) {
       DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index caff354c73510..86ca22cfeffd8 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1346,7 +1346,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
       continue;
 
     unsigned I = Op.getOperandNo();
-    const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I, TRI);
+    const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I);
     if (!OpRC || !TRI->isVSSuperClass(OpRC))
       continue;
 
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index ce1cdb35116cc..80921ce4fb4dd 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -708,7 +708,7 @@ ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const MCInstrDesc &MCID = TII.get(ADDriOpc);
   Register BaseReg = MRI.createVirtualRegister(&ARM::GPRRegClass);
-  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this));
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0));
 
   MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg)
     .addFrameIndex(FrameIdx).addImm(Offset);
@@ -881,8 +881,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   Register PredReg = (PIdx == -1) ? Register() : MI.getOperand(PIdx+1).getReg();
 
   const MCInstrDesc &MCID = MI.getDesc();
-  const TargetRegisterClass *RegClass =
-      TII.getRegClass(MCID, FIOperandNum, this);
+  const TargetRegisterClass *RegClass = TII.getRegClass(MCID, FIOperandNum);
 
   if (Offset == 0 && (FrameReg.isVirtual() || RegClass->contains(FrameReg)))
     // Must be addrmode4/6.
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 138981ad92a87..21a113572ce93 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -2342,7 +2342,6 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
   const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   const ARMBaseInstrInfo &TII =
       *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   unsigned Limit = (1 << 12) - 1;
   for (auto &MBB : MF) {
     for (auto &MI : MBB) {
@@ -2364,7 +2363,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
           break;
 
         const MCInstrDesc &MCID = MI.getDesc();
-        const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI);
+        const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i);
         if (RegClass && !RegClass->contains(ARM::SP))
           HasNonSPFrameIndex = true;
 
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index cd4299b7a1a53..db37b769efcad 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -2424,7 +2424,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(
           Ops.pop_back();
 
           const MCInstrDesc &MCID = TII->get(NewOpc);
-          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI);
+          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0);
           MRI->constrainRegClass(FirstReg, TRC);
           MRI->constrainRegClass(SecondReg, TRC);
 
@@ -3014,7 +3014,7 @@ static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg,
   MachineFunction *MF = MI->getMF();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   const MCInstrDesc &MCID = TII->get(MI->getOpcode());
-  const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp, TRI);
+  const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp);
   MRI.constrainRegClass(NewBaseReg, TRC);
 
   int OldOffset = MI->getOperand(BaseOp + 1).getImm();
@@ -3071,10 +3071,10 @@ static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
 
   const MCInstrDesc &MCID = TII->get(NewOpcode);
   // Constrain the def register class
-  const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI);
+  const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0);
   MRI.constrainRegClass(NewReg, TRC);
   // And do the same for the base operand
-  TRC = TII->getRegClass(MCID, 2, TRI);
+  TRC = TII->getRegClass(MCID, 2);
   MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC);
 
   unsigned AddrMode = (MCID.TSFlags & ARMII::AddrModeMask);
diff --git a/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/llvm/lib/Target/ARM/MLxExpansionPass.cpp
index 8e1bf1d957400..eb237b4275cc9 100644
--- a/llvm/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/llvm/lib/Target/ARM/MLxExpansionPass.cpp
@@ -283,7 +283,7 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
 
   const MCInstrDesc &MCID1 = TII->get(MulOpc);
   const MCInstrDesc &MCID2 = TII->get(AddSubOpc);
-  Register TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI));
+  Register TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0));
 
   MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg)
     .addReg(Src1Reg, getKillRegState(Src1Kill))
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 6e7d8f75b1aef..032acf559ffc3 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -564,7 +564,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
   bool isSub = false;
 
   MachineFunction &MF = *MI.getParent()->getParent();
-  const TargetRegisterClass *RegClass = TII.getRegClass(Desc, FrameRegIdx, TRI);
+  const TargetRegisterClass *RegClass = TII.getRegClass(Desc, FrameRegIdx);
 
   // Memory operands in inline assembly always use AddrModeT2_i12.
   if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR)
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 68f53124f9db8..b378ce47fd1bd 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1886,7 +1886,7 @@ bool BitSimplification::matchHalf(unsigned SelfR,
 
 bool BitSimplification::validateReg(BitTracker::RegisterRef R, unsigned Opc,
       unsigned OpNum) {
-  auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum, &HRI);
+  auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum);
   auto *RRC = HBS::getFinalVRegClass(R, MRI);
   return OpRC->hasSubClassEq(RRC);
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index dd343d9fbe79f..a90255d1f62e6 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -2225,7 +2225,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
         if (!Bad) {
           // If the addressing mode is ok, check the register class.
           unsigned OpNum = Load ? 0 : 2;
-          auto *RC = HII.getRegClass(In.getDesc(), OpNum, &HRI);
+          auto *RC = HII.getRegClass(In.getDesc(), OpNum);
           RC = getCommonRC(SI.RC, RC);
           if (RC == nullptr)
             Bad = true;
@@ -2395,7 +2395,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
 
         HexagonBlockRanges::RegisterRef SrcRR = { SrcOp.getReg(),
                                                   SrcOp.getSubReg() };
-        auto *RC = HII.getRegClass(SI.getDesc(), 2, &HRI);
+        auto *RC = HII.getRegClass(SI.getDesc(), 2);
         // The this-> is needed to unconfuse MSVC.
         Register FoundR = this->findPhysReg(MF, Range, IM, DM, RC);
         LLVM_DEBUG(dbgs() << "Replacement reg:" << printReg(FoundR, &HRI)
diff --git a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp
index 7cbd81ff227e1..54969b2317ef4 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp
@@ -646,7 +646,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
     MachineInstr *CombI;
     if (Acc != 0) {
       const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi);
-      const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI);
+      const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0);
       Register VReg = MF->getRegInfo().createVirtualRegister(RC);
       MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(LowerAcc);
       NG.push_back(TfrI);
@@ -677,7 +677,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
   } else {
     // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg
     const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi);
-    const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI);
+    const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0);
     Register VReg = MF->getRegInfo().createVirtualRegister(RC);
     MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(int(Acc));
     NG.push_back(TfrI);
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index cb88d1ac4af9f..d39b79a86753a 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -653,7 +653,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI,
   const MCInstrDesc& MCID = PacketMI.getDesc();
 
   // First operand is always the result.
-  const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI);
+  const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0);
   // Double regs can not feed into new value store: PRM section: 5.4.2.2.
   if (PacketRC == &Hexagon::DoubleRegsRegClass)
     return false;
@@ -866,7 +866,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI,
     return false;
 
   const MCInstrDesc& MCID = PI.getDesc();
-  const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI);
+  const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0);
   if (DisableVecDblNVStores && VecRC == &Hexagon::HvxWRRegClass)
     return false;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp
index 0ccebeb393267..6358e348fe424 100644
--- a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp
@@ -60,7 +60,6 @@ bool LoongArchDeadRegisterDefinitions::runOnMachineFunction(
     return false;
 
   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
   LLVM_DEBUG(dbgs() << "***** LoongArchDeadRegisterDefinitions *****\n");
 
@@ -86,7 +85,7 @@ bool LoongArchDeadRegisterDefinitions::runOnMachineFunction(
           continue;
         LLVM_DEBUG(dbgs() << "    Dead def operand #" << I << " in:\n      ";
                    MI.print(dbgs()));
-        const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI);
+        const TargetRegisterClass *RC = TII->getRegClass(Desc, I);
         if (!(RC && RC->contains(LoongArch::R0))) {
           LLVM_DEBUG(dbgs() << "    Ignoring, register is not a GPR.\n");
           continue;
diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
index 517f489984c27..b3aa5ac1ac1b5 100644
--- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -678,8 +678,8 @@ MipsSEInstrInfo::compareOpndSize(unsigned Opc,
   const MCInstrDesc &Desc = get(Opc);
   assert(Desc.NumOperands == 2 && "Unary instruction expected.");
   const MipsRegisterInfo *RI = &getRegisterInfo();
-  unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0, RI));
-  unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1, RI));
+  unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0));
+  unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1));
 
   return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize);
 }
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 85b40727ff296..b3a7c829958ec 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -2023,7 +2023,7 @@ Register PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   const TargetRegisterClass *RC = getPointerRegClass();
   Register BaseReg = MRI.createVirtualRegister(RC);
-  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this));
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0));
 
   BuildMI(*MBB, Ins, DL, MCID, BaseReg)
     .addFrameIndex(FrameIdx).addImm(Offset);
@@ -2051,7 +2051,7 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   const MCInstrDesc &MCID = MI.getDesc();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, FIOperandNum, this));
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, FIOperandNum));
 }
 
 bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
diff --git a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp
index 51180f548ca6d..5d3d9b5c4cf03 100644
--- a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp
+++ b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp
@@ -59,7 +59,6 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
     return false;
 
   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
   LLVM_DEBUG(dbgs() << "***** RISCVDeadRegisterDefinitions *****\n");
 
@@ -89,7 +88,7 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
         LLVM_DEBUG(dbgs() << "    Dead def operand #" << I << " in:\n      ";
                    MI.print(dbgs()));
         Register X0Reg;
-        const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI);
+        const TargetRegisterClass *RC = TII->getRegClass(Desc, I);
         if (RC && RC->contains(RISCV::X0)) {
           X0Reg = RISCV::X0;
         } else if (RC && RC->contains(RISCV::X0_W)) {
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index fdf9a4fe32fe6..e1ff243bb1a47 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -455,7 +455,7 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) {
     True->getOperand(1).setReg(MI.getOperand(2).getReg());
     // If True is masked then its passthru needs to be in VRNoV0.
     MRI->constrainRegClass(True->getOperand(1).getReg(),
-                           TII->getRegClass(True->getDesc(), 1, TRI));
+                           TII->getRegClass(True->getDesc(), 1));
   }
 
   MI.setDesc(TII->get(NewOpc));
@@ -675,7 +675,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
     if (Passthru.getReg().isValid())
       MRI->constrainRegClass(
           Passthru.getReg(),
-          TII->getRegClass(Src->getDesc(), SrcPassthru.getOperandNo(), TRI));
+          TII->getRegClass(Src->getDesc(), SrcPassthru.getOperandNo()));
   }
 
   if (RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) {
diff --git a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
index 5313fba3bed1d..8fc339f59e60a 100644
--- a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -115,11 +115,10 @@ SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const {
 }
 
 bool SystemZHazardRecognizer::has4RegOps(const MachineInstr *MI) const {
-  const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
   const MCInstrDesc &MID = MI->getDesc();
   unsigned Count = 0;
   for (unsigned OpIdx = 0; OpIdx < MID.getNumOperands(); OpIdx++) {
-    const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx, TRI);
+    const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx);
     if (RC == nullptr)
       continue;
     if (OpIdx >= MID.getNumDefs() &&
diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index d2e35277419f7..9473e8db3af93 100644
--- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -387,8 +387,8 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
   MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
   MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
 
-  Register Reg1 = MRI->createVirtualRegister(
-      TII->getRegClass(TII->get(NLoadOpcode), 0, TRI));
+  Register Reg1 =
+      MRI->createVirtualRegister(TII->getRegClass(TII->get(NLoadOpcode), 0));
   MachineInstr *NewLoad =
       BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
               Reg1)
@@ -553,7 +553,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
 }
 
 unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
-  const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI);
+  const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0);
   return TRI->getRegSizeInBits(*TRC) / 8;
 }
 
diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp
index 5d190114615de..2047a53199dd6 100644
--- a/llvm/lib/Target/X86/X86DomainReassignment.cpp
+++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -174,8 +174,8 @@ class InstrReplacerDstCOPY : public InstrConverterBase {
     MachineBasicBlock *MBB = MI->getParent();
     const DebugLoc &DL = MI->getDebugLoc();
 
-    Register Reg = MRI->createVirtualRegister(
-        TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo()));
+    Register Reg =
+        MRI->createVirtualRegister(TII->getRegClass(TII->get(DstOpcode), 0));
     MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg);
     for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
       Bld.add(MO);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 2c6d1af225b6b..86ce7645eb91c 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -93,10 +93,9 @@ X86InstrInfo::X86InstrInfo(const X86Subtarget &STI)
                       X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
       Subtarget(STI), RI(STI.getTargetTriple()) {}
 
-const TargetRegisterClass *
-X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
-                          const TargetRegisterInfo *TRI) const {
-  auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI);
+const TargetRegisterClass *X86InstrInfo::getRegClass(const MCInstrDesc &MCID,
+                                                     unsigned OpNum) const {
+  auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum);
   // If the target does not have egpr, then r16-r31 will be resereved for all
   // instructions.
   if (!RC || !Subtarget.hasEGPR())
@@ -7235,7 +7234,6 @@ static void updateOperandRegConstraints(MachineFunction &MF,
                                         MachineInstr &NewMI,
                                         const TargetInstrInfo &TII) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
 
   for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
     MachineOperand &MO = NewMI.getOperand(Idx);
@@ -7247,7 +7245,7 @@ static void updateOperandRegConstraints(MachineFunction &MF,
       continue;
 
     auto *NewRC =
-        MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI));
+        MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx));
     if (!NewRC) {
       LLVM_DEBUG(
           dbgs() << "WARNING: Unable to update register constraint for operand "
@@ -7345,7 +7343,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
       unsigned SrcIdx = (Imm >> 6) & 3;
 
       const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
-      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
+      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
       unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
       if ((Size == 0 || Size >= 16) && RCSize >= 16 &&
           (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) {
@@ -7370,7 +7368,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
     // TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
     if (OpNum == 2) {
       const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
-      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
+      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
       unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
       if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
         unsigned NewOpCode =
@@ -7389,7 +7387,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
     // table twice.
     if (OpNum == 2) {
       const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
-      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
+      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
       unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
       if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
         MachineInstr *NewMI =
@@ -7524,7 +7522,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     bool NarrowToMOV32rm = false;
     if (Size) {
       const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
-      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
+      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
       unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
       // Check if it's safe to fold the load. If the size of the object is
       // narrower than the load width, then it's not.
@@ -8118,9 +8116,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
              RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass;
     };
 
-    if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1, &RI)))
+    if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1)))
       MaskReg = Op1.getReg();
-    else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2, &RI)))
+    else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2)))
       MaskReg = Op2.getReg();
 
     if (MaskReg) {
@@ -8524,7 +8522,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
 
   const MCInstrDesc &MCID = get(Opc);
 
-  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+  const TargetRegisterClass *RC = getRegClass(MCID, Index);
   const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
   // TODO: Check if 32-byte or greater accesses are slow too?
   if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
@@ -8635,7 +8633,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
 
   // Emit the store instruction.
   if (UnfoldStore) {
-    const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI);
+    const TargetRegisterClass *DstRC = getRegClass(MCID, 0);
     auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
     unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
     bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
@@ -8667,7 +8665,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
   const MCInstrDesc &MCID = get(Opc);
   MachineFunction &MF = DAG.getMachineFunction();
   const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
-  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
+  const TargetRegisterClass *RC = getRegClass(MCID, Index);
   unsigned NumDefs = MCID.NumDefs;
   std::vector<SDValue> AddrOps;
   std::vector<SDValue> BeforeOps;
@@ -8718,7 +8716,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
   std::vector<EVT> VTs;
   const TargetRegisterClass *DstRC = nullptr;
   if (MCID.getNumDefs() > 0) {
-    DstRC = getRegClass(MCID, 0, &RI);
+    DstRC = getRegClass(MCID, 0);
     VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
   }
   for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 5f75559bd9598..ec7db70e51aaf 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -246,9 +246,8 @@ class X86InstrInfo final : public X86GenInstrInfo {
   /// GR*RegClass (definition in TD file)
   /// ->
   /// GR*_NOREX2RegClass (Returned register class)
-  const TargetRegisterClass *
-  getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
-              const TargetRegisterInfo *TRI) const override;
+  const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID,
+                                         unsigned OpNum) const override;
 
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   /// such, whenever a client has an instance of instruction info, it should
diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index 167bed132cd12..c9646053afac1 100644
--- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -359,7 +359,7 @@ bool X86OptimizeLEAPass::chooseBestLEA(
     // example MOV8mr_NOREX. We could constrain the register class of the LEA
     // def to suit MI, however since this case is very rare and hard to
     // reproduce in a test it's just more reliable to skip the LEA.
-    if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI) !=
+    if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg) !=
         MRI->getRegClass(DefMI->getOperand(0).getReg()))
       continue;
 
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index e0b3b61e29175..d0d897e6784d3 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -841,7 +841,7 @@ getRegClassForUnfoldedLoad(const X86InstrInfo &TII, unsigned Opcode) {
   unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
       Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
   const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
-  return TII.getRegClass(MCID, Index, &TII.getRegisterInfo());
+  return TII.getRegClass(MCID, Index);
 }
 
 void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(

From a78f587a255fd1600377423d7b1f76a2fa03c02d Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Mon, 10 Nov 2025 15:57:46 -0800
Subject: [PATCH 35/97] [DirectX] Fix dxc-compatible DataLayout in
 BitcodeWriter (#167410)

In #163587 we attempted to generate a dxc-compatible DataLayout when
emitting DXIL, but somehow got it wrong. Update using the DataLayout
that dxc emits verbatim.

Fixes #163882
---
 llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 26a8728e1f37c..48a9085820471 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -1169,8 +1169,8 @@ void DXILBitcodeWriter::writeModuleInfo() {
   // We need to hardcode a triple and datalayout that's compatible with the
   // historical DXIL triple and datalayout from DXC.
   StringRef Triple = "dxil-ms-dx";
-  StringRef DL = "e-m:e-p:32:32-i1:8-i8:8-i16:32-i32:32-i64:64-"
-                 "f16:32-f32:32-f64:64-n8:16:32:64";
+  StringRef DL = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-"
+                 "f16:16-f32:32-f64:64-n8:16:32:64";
   writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE, Triple, 0 /*TODO*/);
   writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/);
 

From cd6c761b53435cfd1a1ead67b461fc07819602e6 Mon Sep 17 00:00:00 2001
From: Rahman Lavaee <rahmanl@google.com>
Date: Mon, 10 Nov 2025 16:11:25 -0800
Subject: [PATCH 36/97] [LLDB] Fix lldb failure caused by 95db31e7f6. (#167423)

---
 lldb/test/API/functionalities/thread/step_until/function.list | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lldb/test/API/functionalities/thread/step_until/function.list b/lldb/test/API/functionalities/thread/step_until/function.list
index 5900fe8c35069..43c121eff9631 100644
--- a/lldb/test/API/functionalities/thread/step_until/function.list
+++ b/lldb/test/API/functionalities/thread/step_until/function.list
@@ -1 +1,3 @@
-!call_me
+v1
+f call_me
+c 0

From 8751f26a1bfac82703aaa36c61edf8771ef9e77f Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
Date: Mon, 10 Nov 2025 19:15:34 -0500
Subject: [PATCH 37/97] [libc] add an SVE implementation of strlen (#167259)

This PR creates an SVE-based implementation for strlen by translating
from the AOR code in tree. Microbenchmark shows improvements against
NEON when N>=64. Although both implementations fall behind glibc by a
large margin,
this may be a good start point to explore SVE implementations.

Together with the PR:

1. Added two more tests of strlen with special nul symbols.
2. Added strlen's fuzzer and fix a typo in previous heap fuzzer.

```
=== strlen(16 bytes) ===
libc: 1.56115 ns/call, 9.54499 GiB/s
neon: 1.59393 ns/call, 9.34867 GiB/s
sve: 1.66097 ns/call, 8.97134 GiB/s

=== strlen(64 bytes) ===
libc: 2.06967 ns/call, 28.7991 GiB/s
neon: 2.59914 ns/call, 22.9325 GiB/s
sve: 2.58628 ns/call, 23.0465 GiB/s

=== strlen(256 bytes) ===
libc: 3.74165 ns/call, 63.7202 GiB/s
neon: 8.98243 ns/call, 26.5428 GiB/s
sve: 7.36426 ns/call, 32.3751 GiB/s

=== strlen(1024 bytes) ===
libc: 10.5327 ns/call, 90.5438 GiB/s
neon: 34.363 ns/call, 27.7529 GiB/s
sve: 26.9329 ns/call, 35.4092 GiB/s

=== strlen(4096 bytes) ===
libc: 37.7304 ns/call, 101.104 GiB/s
neon: 145.911 ns/call, 26.144 GiB/s
sve: 103.208 ns/call, 36.9612 GiB/s

=== strlen(1048576 bytes) ===
libc: 9623.4 ns/call, 101.478 GiB/s
neon: 36138.2 ns/call, 27.023 GiB/s
sve: 26605.6 ns/call, 36.7051 GiB/s
```
---
 libc/fuzzing/__support/freelist_heap_fuzz.cpp |  2 +-
 libc/fuzzing/string/CMakeLists.txt            |  8 +++
 libc/fuzzing/string/strlen_fuzz.cpp           | 32 ++++++++++
 .../memory_utils/aarch64/inline_strlen.h      | 63 +++++++++++++++++--
 libc/test/src/string/strlen_test.cpp          | 12 ++++
 5 files changed, 111 insertions(+), 6 deletions(-)
 create mode 100644 libc/fuzzing/string/strlen_fuzz.cpp

diff --git a/libc/fuzzing/__support/freelist_heap_fuzz.cpp b/libc/fuzzing/__support/freelist_heap_fuzz.cpp
index 7b7985a83c3e6..0b400cb156491 100644
--- a/libc/fuzzing/__support/freelist_heap_fuzz.cpp
+++ b/libc/fuzzing/__support/freelist_heap_fuzz.cpp
@@ -24,7 +24,7 @@ asm(R"(
 _end:
   .fill 1024
 __llvm_libc_heap_limit:
-)";
+)");
 
 using LIBC_NAMESPACE::FreeListHeap;
 using LIBC_NAMESPACE::inline_memset;
diff --git a/libc/fuzzing/string/CMakeLists.txt b/libc/fuzzing/string/CMakeLists.txt
index efda80b59c951..0918e92552ea7 100644
--- a/libc/fuzzing/string/CMakeLists.txt
+++ b/libc/fuzzing/string/CMakeLists.txt
@@ -40,3 +40,11 @@ add_libc_fuzzer(
   DEPENDS
     libc.src.strings.bcmp
 )
+
+add_libc_fuzzer(
+  strlen_fuzz
+  SRCS
+    strlen_fuzz.cpp
+  DEPENDS
+    libc.src.string.strlen
+)
diff --git a/libc/fuzzing/string/strlen_fuzz.cpp b/libc/fuzzing/string/strlen_fuzz.cpp
new file mode 100644
index 0000000000000..dd72c19b7fdc7
--- /dev/null
+++ b/libc/fuzzing/string/strlen_fuzz.cpp
@@ -0,0 +1,32 @@
+//===-- strlen_fuzz.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc strlen implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/string/strlen.h"
+#include <cstdint>
+#include <cstring>
+
+// always null terminate the data
+extern "C" size_t LLVMFuzzerMutate(uint8_t *data, size_t size, size_t max_size);
+extern "C" size_t LLVMFuzzerCustomMutator(uint8_t *data, size_t size,
+                                          size_t max_size, unsigned int seed) {
+  size = LLVMFuzzerMutate(data, size, max_size);
+  data[size - 1] = '\0';
+  return size;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  size_t ref = ::strlen(reinterpret_cast<const char *>(data));
+  size_t impl = LIBC_NAMESPACE::strlen(reinterpret_cast<const char *>(data));
+  if (ref != impl)
+    __builtin_trap();
+  return 0;
+}
diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h
index 87f5ccdd56e23..eafaca9776a42 100644
--- a/libc/src/string/memory_utils/aarch64/inline_strlen.h
+++ b/libc/src/string/memory_utils/aarch64/inline_strlen.h
@@ -8,14 +8,13 @@
 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
 
+#include "src/__support/macros/properties/cpu_features.h"
+
 #if defined(__ARM_NEON)
 #include "src/__support/CPP/bit.h" // countr_zero
-
 #include <arm_neon.h>
 #include <stddef.h> // size_t
-
 namespace LIBC_NAMESPACE_DECL {
-
 namespace neon {
 [[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t
 string_length(const char *src) {
@@ -45,9 +44,63 @@ string_length(const char *src) {
   }
 }
 } // namespace neon
+} // namespace LIBC_NAMESPACE_DECL
+#endif // __ARM_NEON
 
-namespace string_length_impl = neon;
+#ifdef LIBC_TARGET_CPU_HAS_SVE
+#include "src/__support/macros/optimization.h"
+#include <arm_sve.h>
+namespace LIBC_NAMESPACE_DECL {
+namespace sve {
+[[maybe_unused]] LIBC_INLINE static size_t string_length(const char *src) {
+  const uint8_t *ptr = reinterpret_cast<const uint8_t *>(src);
+  // Initialize the first-fault register to all true
+  svsetffr();
+  const svbool_t all_true = svptrue_b8(); // all true predicate
+  svbool_t cmp_zero;
+  size_t len = 0;
 
+  for (;;) {
+    // Read a vector's worth of bytes, stopping on first fault.
+    svuint8_t data = svldff1_u8(all_true, &ptr[len]);
+    svbool_t fault_mask = svrdffr_z(all_true);
+    bool has_no_fault = svptest_last(all_true, fault_mask);
+    if (LIBC_LIKELY(has_no_fault)) {
+      // First fault did not fail: the whole vector is valid.
+      // Avoid depending on the contents of FFR beyond the branch.
+      len += svcntb(); // speculative increment
+      cmp_zero = svcmpeq_n_u8(all_true, data, 0);
+      bool has_no_zero = !svptest_any(all_true, cmp_zero);
+      if (LIBC_LIKELY(has_no_zero))
+        continue;
+      len -= svcntb(); // undo speculative increment
+      break;
+    } else {
+      // First fault failed: only some of the vector is valid.
+      // Perform the comparison only on the valid bytes.
+      cmp_zero = svcmpeq_n_u8(fault_mask, data, 0);
+      bool has_zero = svptest_any(fault_mask, cmp_zero);
+      if (LIBC_LIKELY(has_zero))
+        break;
+      svsetffr();
+      len += svcntp_b8(all_true, fault_mask);
+      continue;
+    }
+  }
+  // Select the bytes before the first and count them.
+  svbool_t before_zero = svbrkb_z(all_true, cmp_zero);
+  len += svcntp_b8(all_true, before_zero);
+  return len;
+}
+} // namespace sve
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LIBC_TARGET_CPU_HAS_SVE
+
+namespace LIBC_NAMESPACE_DECL {
+#ifdef LIBC_TARGET_CPU_HAS_SVE
+namespace string_length_impl = sve;
+#elif defined(__ARM_NEON)
+namespace string_length_impl = neon;
+#endif
 } // namespace LIBC_NAMESPACE_DECL
-#endif // __ARM_NEON
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
diff --git a/libc/test/src/string/strlen_test.cpp b/libc/test/src/string/strlen_test.cpp
index 4eb9d47e9209d..784dd7b194b3f 100644
--- a/libc/test/src/string/strlen_test.cpp
+++ b/libc/test/src/string/strlen_test.cpp
@@ -22,3 +22,15 @@ TEST(LlvmLibcStrLenTest, AnyString) {
   size_t result = LIBC_NAMESPACE::strlen(any);
   ASSERT_EQ((size_t)12, result);
 }
+
+TEST(LlvmLibcStrLenTest, DataAfterNulString) {
+  constexpr char A[10] = {'a', 'b', 'c', 'd', 'e', 'f', 0, 'h', 'i', 'j'};
+  size_t result = LIBC_NAMESPACE::strlen(A);
+  ASSERT_EQ((size_t)6, result);
+}
+
+TEST(LlvmLibcStrLenTest, MultipleNulsInOneWord) {
+  constexpr char A[10] = {'a', 'b', 0, 'd', 'e', 'f', 0, 'h', 'i', 'j'};
+  size_t result = LIBC_NAMESPACE::strlen(A);
+  ASSERT_EQ((size_t)2, result);
+}

From 222f4e494a0cd9515c242fd083c2776772734385 Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental@gmail.com>
Date: Mon, 10 Nov 2025 16:21:39 -0800
Subject: [PATCH 38/97] [mlir] Add FP software implementation lowering pass:
 `arith-to-apfloat` (#166618)

This commit adds a new pass that lowers floating-point `arith`
operations to calls into the execution engine runtime library. Currently
supported operations: `addf`, `subf`, `mulf`, `divf`, `remf`.

All floating-point types that have an APFloat semantics are supported.
This includes low-precision floating-point types such as `f4E2M1FN` that
cannot execute natively on CPUs.

This commit also improves the `vector.print` lowering pattern to call
into the runtime library for floating-point types that are not supported
by LLVM. This is necessary to write a meaningful integration test.

The way it works is

```mlir
func.func @full_example() {
  %a = arith.constant 1.4 : f8E4M3FN
  %b = func.call @foo() : () -> (f8E4M3FN)
  %c = arith.addf %a, %b : f8E4M3FN
  vector.print %c : f8E4M3FN
  return
}
```

gets transformed to

```mlir
func.func private @__mlir_apfloat_add(i32, i64, i64) -> i6
func.func @full_example() {
  %cst = arith.constant 1.375000e+00 : f8E4M3FN
  %0 = call @foo() : () -> f8E4M3FN
  // bitcast operand A to integer of equal width
  %1 = arith.bitcast %cst : f8E4M3FN to i8
  // zext A to i64
  %2 = arith.extui %1 : i8 to i64
  // same for operand B
  %3 = arith.bitcast %0 : f8E4M3FN to i8
  %4 = arith.extui %3 : i8 to i64
  // get the llvm::fltSemantics(f8E4M3FN) as an enum
  %c10_i32 = arith.constant 10 : i32
  // call the impl against APFloat in mlir_apfloat_wrappers
  %5 = call @__mlir_apfloat_add(%c10_i32, %2, %4) : (i32, i64, i64) -> i64
  // "cast" back to the original fp type
  %6 = arith.trunci %5 : i64 to i8
  %7 = arith.bitcast %6 : i8 to f8E4M3FN
  vector.print %7 : f8E4M3FN
}
```

Note, `llvm::fltSemantics(f8E4M3FN)` is emitted by the pattern each time
an `arith` op is transformed, thereby making the call to
`__mlir_apfloat_add` correct (i.e., no name mangling on type necessary).


RFC:
https://discourse.llvm.org/t/rfc-software-implementation-for-unsupported-fp-types-in-convert-arith-to-llvm/88785

---------

Co-authored-by: Matthias Springer <me@m-sp.org>
---
 .../ArithToAPFloat/ArithToAPFloat.h           |  21 +++
 mlir/include/mlir/Conversion/Passes.h         |   1 +
 mlir/include/mlir/Conversion/Passes.td        |  15 ++
 mlir/include/mlir/Dialect/Func/Utils/Utils.h  |   7 +
 .../mlir/Dialect/LLVMIR/FunctionCallUtils.h   |   4 +
 .../ArithToAPFloat/ArithToAPFloat.cpp         | 161 ++++++++++++++++++
 .../Conversion/ArithToAPFloat/CMakeLists.txt  |  17 ++
 .../Conversion/ArithToLLVM/ArithToLLVM.cpp    |   1 +
 mlir/lib/Conversion/CMakeLists.txt            |   1 +
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      |  14 ++
 mlir/lib/Dialect/Func/Utils/Utils.cpp         |  25 +++
 .../Dialect/LLVMIR/IR/FunctionCallUtils.cpp   |  11 ++
 mlir/lib/ExecutionEngine/APFloatWrappers.cpp  |  81 +++++++++
 mlir/lib/ExecutionEngine/CMakeLists.txt       |  12 ++
 .../ArithToApfloat/arith-to-apfloat.mlir      | 128 ++++++++++++++
 .../Arith/CPU/test-apfloat-emulation.mlir     |  34 ++++
 16 files changed, 533 insertions(+)
 create mode 100644 mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
 create mode 100644 mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
 create mode 100644 mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
 create mode 100644 mlir/lib/ExecutionEngine/APFloatWrappers.cpp
 create mode 100644 mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
 create mode 100644 mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir

diff --git a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
new file mode 100644
index 0000000000000..64a42a228199e
--- /dev/null
+++ b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
@@ -0,0 +1,21 @@
+//===- ArithToAPFloat.h - Arith to APFloat impl conversion ---*- C++ ----*-===//
+//
+// Part of the APFloat Project, under the Apache License v2.0 with APFloat
+// Exceptions. See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH APFloat-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
+#define MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
+
+#include <memory>
+
+namespace mlir {
+class Pass;
+
+#define GEN_PASS_DECL_ARITHTOAPFLOATCONVERSIONPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index 40d866ec7bf10..82bdfd02661a6 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -12,6 +12,7 @@
 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h"
+#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
 #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h"
 #include "mlir/Conversion/ArithToEmitC/ArithToEmitCPass.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 70e3e45c225db..79bc380dbcb7a 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -186,6 +186,21 @@ def ArithToLLVMConversionPass : Pass<"convert-arith-to-llvm"> {
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// ArithToAPFloat
+//===----------------------------------------------------------------------===//
+
+def ArithToAPFloatConversionPass
+    : Pass<"convert-arith-to-apfloat", "ModuleOp"> {
+  let summary = "Convert Arith ops to APFloat runtime library calls";
+  let description = [{
+    This pass converts supported Arith ops to APFloat-based runtime library
+    calls (APFloatWrappers.cpp). APFloat is a software implementation of
+    floating-point arithmetic operations.
+  }];
+  let dependentDialects = ["func::FuncDialect"];
+}
+
 //===----------------------------------------------------------------------===//
 // ArithToSPIRV
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Func/Utils/Utils.h b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
index 3576126a487ac..00d50874a2e8d 100644
--- a/mlir/include/mlir/Dialect/Func/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
@@ -60,6 +60,13 @@ mlir::FailureOr<std::pair<mlir::func::FuncOp, mlir::func::CallOp>>
 deduplicateArgsOfFuncOp(mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp,
                         mlir::ModuleOp moduleOp);
 
+/// Look up a FuncOp with signature `resultTypes`(`paramTypes`)` and name
+/// `name`. Return a failure if the FuncOp is found but with a different
+/// signature.
+FailureOr<FuncOp> lookupFnDecl(SymbolOpInterface symTable, StringRef name,
+                               FunctionType funcT,
+                               SymbolTableCollection *symbolTables = nullptr);
+
 } // namespace func
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
index 8ad9ed18acebd..b09d32022e348 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
@@ -52,6 +52,10 @@ lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp,
 FailureOr<LLVM::LLVMFuncOp>
 lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
                          SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
+                             SymbolTableCollection *symbolTables = nullptr);
+
 /// Declares a function to print a C-string.
 /// If a custom runtime function is defined via `runtimeFunctionName`, it must
 /// have the signature void(char const*). The default function is `printString`.
diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
new file mode 100644
index 0000000000000..012e934d3050f
--- /dev/null
+++ b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
@@ -0,0 +1,161 @@
+//===- ArithToAPFloat.cpp - Arithmetic to APFloat Conversion --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Func/Utils/Utils.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Verifier.h"
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "arith-to-apfloat"
+
+namespace mlir {
+#define GEN_PASS_DEF_ARITHTOAPFLOATCONVERSIONPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::func;
+
+static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable,
+                           StringRef name, FunctionType funcT, bool setPrivate,
+                           SymbolTableCollection *symbolTables = nullptr) {
+  OpBuilder::InsertionGuard g(b);
+  assert(!symTable->getRegion(0).empty() && "expected non-empty region");
+  b.setInsertionPointToStart(&symTable->getRegion(0).front());
+  FuncOp funcOp = FuncOp::create(b, symTable->getLoc(), name, funcT);
+  if (setPrivate)
+    funcOp.setPrivate();
+  if (symbolTables) {
+    SymbolTable &symbolTable = symbolTables->getSymbolTable(symTable);
+    symbolTable.insert(funcOp, symTable->getRegion(0).front().begin());
+  }
+  return funcOp;
+}
+
+/// Helper function to look up or create the symbol for a runtime library
+/// function for a binary arithmetic operation.
+///
+/// Parameter 1: APFloat semantics
+/// Parameter 2: Left-hand side operand
+/// Parameter 3: Right-hand side operand
+///
+/// This function will return a failure if the function is found but has an
+/// unexpected signature.
+///
+static FailureOr<FuncOp>
+lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name,
+                       SymbolTableCollection *symbolTables = nullptr) {
+  auto i32Type = IntegerType::get(symTable->getContext(), 32);
+  auto i64Type = IntegerType::get(symTable->getContext(), 64);
+
+  std::string funcName = (llvm::Twine("__mlir_apfloat_") + name).str();
+  FunctionType funcT =
+      FunctionType::get(b.getContext(), {i32Type, i64Type, i64Type}, {i64Type});
+  FailureOr<FuncOp> func =
+      lookupFnDecl(symTable, funcName, funcT, symbolTables);
+  // Failed due to type mismatch.
+  if (failed(func))
+    return func;
+  // Successfully matched existing decl.
+  if (*func)
+    return *func;
+
+  return createFnDecl(b, symTable, funcName, funcT,
+                      /*setPrivate=*/true, symbolTables);
+}
+
+/// Rewrite a binary arithmetic operation to an APFloat function call.
+template <typename OpTy, const char *APFloatName>
+struct BinaryArithOpToAPFloatConversion final : OpRewritePattern<OpTy> {
+  BinaryArithOpToAPFloatConversion(MLIRContext *context, PatternBenefit benefit,
+                                   SymbolOpInterface symTable)
+      : OpRewritePattern<OpTy>(context, benefit), symTable(symTable) {};
+
+  LogicalResult matchAndRewrite(OpTy op,
+                                PatternRewriter &rewriter) const override {
+    // Get APFloat function from runtime library.
+    FailureOr<FuncOp> fn =
+        lookupOrCreateBinaryFn(rewriter, symTable, APFloatName);
+    if (failed(fn))
+      return fn;
+
+    rewriter.setInsertionPoint(op);
+    // Cast operands to 64-bit integers.
+    Location loc = op.getLoc();
+    auto floatTy = cast<FloatType>(op.getType());
+    auto intWType = rewriter.getIntegerType(floatTy.getWidth());
+    auto int64Type = rewriter.getI64Type();
+    Value lhsBits = arith::ExtUIOp::create(
+        rewriter, loc, int64Type,
+        arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs()));
+    Value rhsBits = arith::ExtUIOp::create(
+        rewriter, loc, int64Type,
+        arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs()));
+
+    // Call APFloat function.
+    int32_t sem =
+        llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
+    Value semValue = arith::ConstantOp::create(
+        rewriter, loc, rewriter.getI32Type(),
+        rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
+    SmallVector<Value> params = {semValue, lhsBits, rhsBits};
+    auto resultOp =
+        func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()),
+                             SymbolRefAttr::get(*fn), params);
+
+    // Truncate result to the original width.
+    Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType,
+                                                  resultOp->getResult(0));
+    rewriter.replaceOp(
+        op, arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits));
+    return success();
+  }
+
+  SymbolOpInterface symTable;
+};
+
+namespace {
+struct ArithToAPFloatConversionPass final
+    : impl::ArithToAPFloatConversionPassBase<ArithToAPFloatConversionPass> {
+  using Base::Base;
+
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    RewritePatternSet patterns(context);
+    static const char add[] = "add";
+    static const char subtract[] = "subtract";
+    static const char multiply[] = "multiply";
+    static const char divide[] = "divide";
+    static const char remainder[] = "remainder";
+    patterns.add<BinaryArithOpToAPFloatConversion<arith::AddFOp, add>,
+                 BinaryArithOpToAPFloatConversion<arith::SubFOp, subtract>,
+                 BinaryArithOpToAPFloatConversion<arith::MulFOp, multiply>,
+                 BinaryArithOpToAPFloatConversion<arith::DivFOp, divide>,
+                 BinaryArithOpToAPFloatConversion<arith::RemFOp, remainder>>(
+        context, 1, getOperation());
+    LogicalResult result = success();
+    ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) {
+      if (diag.getSeverity() == DiagnosticSeverity::Error) {
+        result = failure();
+      }
+      // NB: if you don't return failure, no other diag handlers will fire (see
+      // mlir/lib/IR/Diagnostics.cpp:DiagnosticEngineImpl::emit).
+      return failure();
+    });
+    walkAndApplyPatterns(getOperation(), std::move(patterns));
+    if (failed(result))
+      return signalPassFailure();
+  }
+};
+} // namespace
diff --git a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
new file mode 100644
index 0000000000000..b0d1e46b3655f
--- /dev/null
+++ b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
@@ -0,0 +1,17 @@
+add_mlir_conversion_library(MLIRArithToAPFloat
+  ArithToAPFloat.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ArithToLLVM
+
+  DEPENDS
+  MLIRConversionPassIncGen
+
+  LINK_COMPONENTS
+  Core
+
+  LINK_LIBS PUBLIC
+  MLIRArithDialect
+  MLIRArithTransforms
+  MLIRFuncDialect
+  )
diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
index b6099902cc337..f2bacc3399144 100644
--- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
+++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/TypeUtilities.h"
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index bebf1b8fff3f9..613dc6d242ceb 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -2,6 +2,7 @@ add_subdirectory(AffineToStandard)
 add_subdirectory(AMDGPUToROCDL)
 add_subdirectory(ArithCommon)
 add_subdirectory(ArithToAMDGPU)
+add_subdirectory(ArithToAPFloat)
 add_subdirectory(ArithToArmSME)
 add_subdirectory(ArithToEmitC)
 add_subdirectory(ArithToLLVM)
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 69a317ecd101f..c747e1b59558a 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1654,6 +1654,20 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern<vector::PrintOp> {
           return failure();
         }
       }
+    } else if (auto floatTy = dyn_cast<FloatType>(printType)) {
+      // Print other floating-point types using the APFloat runtime library.
+      int32_t sem =
+          llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
+      Value semValue = LLVM::ConstantOp::create(
+          rewriter, loc, rewriter.getI32Type(),
+          rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
+      Value floatBits =
+          LLVM::ZExtOp::create(rewriter, loc, rewriter.getI64Type(), value);
+      printer =
+          LLVM::lookupOrCreateApFloatPrintFn(rewriter, parent, symbolTables);
+      emitCall(rewriter, loc, printer.value(),
+               ValueRange({semValue, floatBits}));
+      return success();
     } else {
       return failure();
     }
diff --git a/mlir/lib/Dialect/Func/Utils/Utils.cpp b/mlir/lib/Dialect/Func/Utils/Utils.cpp
index b4cb0932ef631..d6dfd0229963c 100644
--- a/mlir/lib/Dialect/Func/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Func/Utils/Utils.cpp
@@ -254,3 +254,28 @@ func::deduplicateArgsOfFuncOp(RewriterBase &rewriter, func::FuncOp funcOp,
 
   return std::make_pair(*newFuncOpOrFailure, newCallOp);
 }
+
+FailureOr<func::FuncOp>
+func::lookupFnDecl(SymbolOpInterface symTable, StringRef name,
+                   FunctionType funcT, SymbolTableCollection *symbolTables) {
+  FuncOp func;
+  if (symbolTables) {
+    func = symbolTables->lookupSymbolIn<FuncOp>(
+        symTable, StringAttr::get(symTable->getContext(), name));
+  } else {
+    func = llvm::dyn_cast_or_null<FuncOp>(
+        SymbolTable::lookupSymbolIn(symTable, name));
+  }
+
+  if (!func)
+    return func;
+
+  mlir::FunctionType foundFuncT = func.getFunctionType();
+  // Assert the signature of the found function is same as expected
+  if (funcT != foundFuncT) {
+    return func.emitError("matched function '")
+           << name << "' but with different type: " << foundFuncT
+           << " (expected " << funcT << ")";
+  }
+  return func;
+}
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index feaffa34897b6..160b6ae89215c 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -30,6 +30,7 @@ static constexpr llvm::StringRef kPrintF16 = "printF16";
 static constexpr llvm::StringRef kPrintBF16 = "printBF16";
 static constexpr llvm::StringRef kPrintF32 = "printF32";
 static constexpr llvm::StringRef kPrintF64 = "printF64";
+static constexpr llvm::StringRef kPrintApFloat = "printApFloat";
 static constexpr llvm::StringRef kPrintString = "printString";
 static constexpr llvm::StringRef kPrintOpen = "printOpen";
 static constexpr llvm::StringRef kPrintClose = "printClose";
@@ -160,6 +161,16 @@ mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
       LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
+FailureOr<LLVM::LLVMFuncOp>
+mlir::LLVM::lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
+                                         SymbolTableCollection *symbolTables) {
+  return lookupOrCreateReservedFn(
+      b, moduleOp, kPrintApFloat,
+      {IntegerType::get(moduleOp->getContext(), 32),
+       IntegerType::get(moduleOp->getContext(), 64)},
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
+}
+
 static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) {
   return LLVM::LLVMPointerType::get(context);
 }
diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
new file mode 100644
index 0000000000000..85ea0986cde5b
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
@@ -0,0 +1,81 @@
+//===- APFloatWrappers.cpp - Software Implementation of FP Arithmetics --- ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes the APFloat infrastructure to MLIR programs as a runtime
+// library. APFloat is a software implementation of floating point arithmetics.
+//
+// On the MLIR side, floating-point values must be bitcasted to 64-bit integers
+// before calling a runtime function. If a floating-point type has less than
+// 64 bits, it must be zero-extended to 64 bits after bitcasting it to an
+// integer.
+//
+// Runtime functions receive the floating-point operands of the arithmeic
+// operation in the form of 64-bit integers, along with the APFloat semantics
+// in the form of a 32-bit integer, which will be interpreted as an
+// APFloatBase::Semantics enum value.
+//
+#include "llvm/ADT/APFloat.h"
+
+#if (defined(_WIN32) || defined(__CYGWIN__))
+#define MLIR_APFLOAT_WRAPPERS_EXPORTED __declspec(dllexport)
+#else
+#define MLIR_APFLOAT_WRAPPERS_EXPORTED __attribute__((visibility("default")))
+#endif
+
+/// Binary operations without rounding mode.
+#define APFLOAT_BINARY_OP(OP)                                                  \
+  int64_t MLIR_APFLOAT_WRAPPERS_EXPORTED __mlir_apfloat_##OP(                  \
+      int32_t semantics, uint64_t a, uint64_t b) {                             \
+    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
+        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
+    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
+    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
+    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
+    lhs.OP(rhs);                                                               \
+    return lhs.bitcastToAPInt().getZExtValue();                                \
+  }
+
+/// Binary operations with rounding mode.
+#define APFLOAT_BINARY_OP_ROUNDING_MODE(OP, ROUNDING_MODE)                     \
+  int64_t MLIR_APFLOAT_WRAPPERS_EXPORTED __mlir_apfloat_##OP(                  \
+      int32_t semantics, uint64_t a, uint64_t b) {                             \
+    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
+        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
+    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
+    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
+    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
+    lhs.OP(rhs, ROUNDING_MODE);                                                \
+    return lhs.bitcastToAPInt().getZExtValue();                                \
+  }
+
+extern "C" {
+
+#define BIN_OPS_WITH_ROUNDING(X)                                               \
+  X(add, llvm::RoundingMode::NearestTiesToEven)                                \
+  X(subtract, llvm::RoundingMode::NearestTiesToEven)                           \
+  X(multiply, llvm::RoundingMode::NearestTiesToEven)                           \
+  X(divide, llvm::RoundingMode::NearestTiesToEven)
+
+BIN_OPS_WITH_ROUNDING(APFLOAT_BINARY_OP_ROUNDING_MODE)
+#undef BIN_OPS_WITH_ROUNDING
+#undef APFLOAT_BINARY_OP_ROUNDING_MODE
+
+APFLOAT_BINARY_OP(remainder)
+
+#undef APFLOAT_BINARY_OP
+
+void MLIR_APFLOAT_WRAPPERS_EXPORTED printApFloat(int32_t semantics,
+                                                 uint64_t a) {
+  const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(
+      static_cast<llvm::APFloatBase::Semantics>(semantics));
+  unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);
+  llvm::APFloat x(sem, llvm::APInt(bitWidth, a));
+  double d = x.convertToDouble();
+  fprintf(stdout, "%lg", d);
+}
+}
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index fdeb4dacf9278..8c09e50e4de7b 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -2,6 +2,7 @@
 # is a big dependency which most don't need.
 
 set(LLVM_OPTIONAL_SOURCES
+  APFloatWrappers.cpp
   ArmRunnerUtils.cpp
   ArmSMEStubs.cpp
   AsyncRuntime.cpp
@@ -167,6 +168,15 @@ if(LLVM_ENABLE_PIC)
   set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS)
 
+  add_mlir_library(mlir_apfloat_wrappers
+    SHARED
+    APFloatWrappers.cpp
+
+    EXCLUDE_FROM_LIBMLIR
+    )
+  set_property(TARGET mlir_apfloat_wrappers PROPERTY CXX_STANDARD 17)
+  target_compile_definitions(mlir_apfloat_wrappers PRIVATE mlir_apfloat_wrappers_EXPORTS)
+
   add_subdirectory(SparseTensor)
 
   add_mlir_library(mlir_c_runner_utils
@@ -177,6 +187,7 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
+    mlir_apfloat_wrappers
     mlir_float16_utils
     MLIRSparseTensorEnums
     MLIRSparseTensorRuntime
@@ -191,6 +202,7 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
+    mlir_apfloat_wrappers
     mlir_float16_utils
   )
   target_compile_definitions(mlir_runner_utils PRIVATE mlir_runner_utils_EXPORTS)
diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
new file mode 100644
index 0000000000000..fe4d28a56f808
--- /dev/null
+++ b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
@@ -0,0 +1,128 @@
+// RUN: mlir-opt %s --convert-arith-to-apfloat -split-input-file -verify-diagnostics | FileCheck %s
+
+// CHECK-LABEL:   func.func private @__mlir_apfloat_add(i32, i64, i64) -> i64
+
+// CHECK-LABEL:   func.func @foo() -> f8E4M3FN {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 2.250000e+00 : f8E4M3FN
+// CHECK:           return %[[CONSTANT_0]] : f8E4M3FN
+// CHECK:         }
+
+// CHECK-LABEL:   func.func @bar() -> f6E3M2FN {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 3.000000e+00 : f6E3M2FN
+// CHECK:           return %[[CONSTANT_0]] : f6E3M2FN
+// CHECK:         }
+
+// Illustrate that both f8E4M3FN and f6E3M2FN calling the same __mlir_apfloat_add is fine
+// because each gets its own semantics enum and gets bitcast/extui/trunci to its own width.
+// CHECK-LABEL:   func.func @full_example() {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 1.375000e+00 : f8E4M3FN
+// CHECK:           %[[VAL_0:.*]] = call @foo() : () -> f8E4M3FN
+// CHECK:           %[[BITCAST_0:.*]] = arith.bitcast %[[CONSTANT_0]] : f8E4M3FN to i8
+// CHECK:           %[[EXTUI_0:.*]] = arith.extui %[[BITCAST_0]] : i8 to i64
+// CHECK:           %[[BITCAST_1:.*]] = arith.bitcast %[[VAL_0]] : f8E4M3FN to i8
+// CHECK:           %[[EXTUI_1:.*]] = arith.extui %[[BITCAST_1]] : i8 to i64
+//                  // fltSemantics semantics for f8E4M3FN
+// CHECK:           %[[CONSTANT_1:.*]] = arith.constant 10 : i32
+// CHECK:           %[[VAL_1:.*]] = call @__mlir_apfloat_add(%[[CONSTANT_1]], %[[EXTUI_0]], %[[EXTUI_1]]) : (i32, i64, i64) -> i64
+// CHECK:           %[[TRUNCI_0:.*]] = arith.trunci %[[VAL_1]] : i64 to i8
+// CHECK:           %[[BITCAST_2:.*]] = arith.bitcast %[[TRUNCI_0]] : i8 to f8E4M3FN
+// CHECK:           vector.print %[[BITCAST_2]] : f8E4M3FN
+
+// CHECK:           %[[CONSTANT_2:.*]] = arith.constant 2.500000e+00 : f6E3M2FN
+// CHECK:           %[[VAL_2:.*]] = call @bar() : () -> f6E3M2FN
+// CHECK:           %[[BITCAST_3:.*]] = arith.bitcast %[[CONSTANT_2]] : f6E3M2FN to i6
+// CHECK:           %[[EXTUI_2:.*]] = arith.extui %[[BITCAST_3]] : i6 to i64
+// CHECK:           %[[BITCAST_4:.*]] = arith.bitcast %[[VAL_2]] : f6E3M2FN to i6
+// CHECK:           %[[EXTUI_3:.*]] = arith.extui %[[BITCAST_4]] : i6 to i64
+//                  // fltSemantics semantics for f6E3M2FN
+// CHECK:           %[[CONSTANT_3:.*]] = arith.constant 16 : i32
+// CHECK:           %[[VAL_3:.*]] = call @__mlir_apfloat_add(%[[CONSTANT_3]], %[[EXTUI_2]], %[[EXTUI_3]]) : (i32, i64, i64) -> i64
+// CHECK:           %[[TRUNCI_1:.*]] = arith.trunci %[[VAL_3]] : i64 to i6
+// CHECK:           %[[BITCAST_5:.*]] = arith.bitcast %[[TRUNCI_1]] : i6 to f6E3M2FN
+// CHECK:           vector.print %[[BITCAST_5]] : f6E3M2FN
+// CHECK:           return
+// CHECK:         }
+
+// Put rhs into separate function so that it won't be constant-folded.
+func.func @foo() -> f8E4M3FN {
+  %cst = arith.constant 2.2 : f8E4M3FN
+  return %cst : f8E4M3FN
+}
+
+func.func @bar() -> f6E3M2FN {
+  %cst = arith.constant 3.2 : f6E3M2FN
+  return %cst : f6E3M2FN
+}
+
+func.func @full_example() {
+  %a = arith.constant 1.4 : f8E4M3FN
+  %b = func.call @foo() : () -> (f8E4M3FN)
+  %c = arith.addf %a, %b : f8E4M3FN
+  vector.print %c : f8E4M3FN
+
+  %d = arith.constant 2.4 : f6E3M2FN
+  %e = func.call @bar() : () -> (f6E3M2FN)
+  %f = arith.addf %d, %e : f6E3M2FN
+  vector.print %f : f6E3M2FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @__mlir_apfloat_add(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @__mlir_apfloat_add(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// Test decl collision (different type)
+// expected-error@+1{{matched function '__mlir_apfloat_add' but with different type: '(i32, i32, f32) -> index' (expected '(i32, i64, i64) -> i64')}}
+func.func private @__mlir_apfloat_add(i32, i32, f32) -> index
+func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @__mlir_apfloat_subtract(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @__mlir_apfloat_subtract(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.subf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @__mlir_apfloat_multiply(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @__mlir_apfloat_multiply(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.mulf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @__mlir_apfloat_divide(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @__mlir_apfloat_divide(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.divf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @__mlir_apfloat_remainder(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @__mlir_apfloat_remainder(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @remf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.remf %arg0, %arg1 : f4E2M1FN
+  return
+}
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
new file mode 100644
index 0000000000000..a2b3eb73a60b8
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
@@ -0,0 +1,34 @@
+// Case 1: All floating-point arithmetics is lowered through APFloat.
+// RUN: mlir-opt %s --convert-arith-to-apfloat --convert-to-llvm | \
+// RUN: mlir-runner -e entry --entry-point-result=void \
+// RUN:             --shared-libs=%mlir_c_runner_utils | FileCheck %s
+
+// Case 2: Only unsupported arithmetics (f8E4M3FN) is lowered through APFloat.
+//         Arithmetics on f32 is lowered directly to LLVM.
+// RUN: mlir-opt %s --convert-to-llvm --convert-arith-to-apfloat \
+// RUN:          --convert-to-llvm --reconcile-unrealized-casts | \
+// RUN: mlir-runner -e entry --entry-point-result=void \
+// RUN:             --shared-libs=%mlir_c_runner_utils | FileCheck %s
+
+// Put rhs into separate function so that it won't be constant-folded.
+func.func @foo() -> (f8E4M3FN, f32) {
+  %cst1 = arith.constant 2.2 : f8E4M3FN
+  %cst2 = arith.constant 2.2 : f32
+  return %cst1, %cst2 : f8E4M3FN, f32
+}
+
+func.func @entry() {
+  %a1 = arith.constant 1.4 : f8E4M3FN
+  %a2 = arith.constant 1.4 : f32
+  %b1, %b2 = func.call @foo() : () -> (f8E4M3FN, f32)
+  %c1 = arith.addf %a1, %b1 : f8E4M3FN  // not supported by LLVM
+  %c2 = arith.addf %a2, %b2 : f32       // supported by LLVM
+
+  // CHECK: 3.5
+  vector.print %c1 : f8E4M3FN
+
+  // CHECK: 3.6
+  vector.print %c2 : f32
+
+  return
+}

From a88fa64e2570bac79545e24c010bf2fdc21162cf Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 16:23:36 -0800
Subject: [PATCH 39/97] CodeGen: Remove TRI argument from reMaterialize
 (#158229)

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 3 +--
 llvm/lib/CodeGen/LiveRangeEdit.cpp          | 2 +-
 llvm/lib/CodeGen/MachineSink.cpp            | 2 +-
 llvm/lib/CodeGen/TargetInstrInfo.cpp        | 8 ++++----
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 5 ++---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp      | 6 +++---
 llvm/lib/Target/AMDGPU/SIInstrInfo.h        | 3 +--
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp    | 3 +--
 llvm/lib/Target/ARM/ARMBaseInstrInfo.h      | 3 +--
 llvm/lib/Target/X86/X86InstrInfo.cpp        | 3 +--
 llvm/lib/Target/X86/X86InstrInfo.h          | 3 +--
 11 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 2a0b1a6b183bc..7940cd81d319b 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -463,8 +463,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   /// SubIdx.
   virtual void reMaterialize(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI, Register DestReg,
-                             unsigned SubIdx, const MachineInstr &Orig,
-                             const TargetRegisterInfo &TRI) const;
+                             unsigned SubIdx, const MachineInstr &Orig) const;
 
   /// Clones instruction or the whole instruction bundle \p Orig and
   /// insert into \p MBB before \p InsertBefore. The target may update operands
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 5b0365da4e8c6..6fe11704a9137 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -88,7 +88,7 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB,
                                          bool Late, unsigned SubIdx,
                                          MachineInstr *ReplaceIndexMI) {
   assert(RM.OrigMI && "Invalid remat");
-  TII.reMaterialize(MBB, MI, DestReg, SubIdx, *RM.OrigMI, tri);
+  TII.reMaterialize(MBB, MI, DestReg, SubIdx, *RM.OrigMI);
   // DestReg of the cloned instruction cannot be Dead. Set isDead of DestReg
   // to false anyway in case the isDead flag of RM.OrigMI's dest register
   // is true.
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 94ed82eee9b8f..0ceeda4eb16d2 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -569,7 +569,7 @@ bool MachineSinking::PerformSinkAndFold(MachineInstr &MI,
       // Sink a copy of the instruction, replacing a COPY instruction.
       MachineBasicBlock::iterator InsertPt = SinkDst->getIterator();
       Register DstReg = SinkDst->getOperand(0).getReg();
-      TII->reMaterialize(*SinkDst->getParent(), InsertPt, DstReg, 0, MI, *TRI);
+      TII->reMaterialize(*SinkDst->getParent(), InsertPt, DstReg, 0, MI);
       New = &*std::prev(InsertPt);
       if (!New->getDebugLoc())
         New->setDebugLoc(SinkDst->getDebugLoc());
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 6c09d6c650456..8fac77f22cd8a 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -447,10 +447,10 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
   return true;
 }
 
-void TargetInstrInfo::reMaterialize(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
-    unsigned SubIdx, const MachineInstr &Orig,
-    const TargetRegisterInfo & /*Remove me*/) const {
+void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    Register DestReg, unsigned SubIdx,
+                                    const MachineInstr &Orig) const {
   MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
   MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI);
   MBB.insert(I, MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 9fbf9e5fe8eeb..23ba4ad718415 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2011,7 +2011,7 @@ void PreRARematStage::rematerialize() {
 
     // Rematerialize DefMI to its use block.
     TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
-                       AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
+                       AMDGPU::NoSubRegister, *DefMI);
     Remat.RematMI = &*std::prev(InsertPos);
     DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
 
@@ -2163,8 +2163,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
     // Re-rematerialize MI at the end of its original region. Note that it may
     // not be rematerialized exactly in the same position as originally within
     // the region, but it should not matter much.
-    TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI,
-                       *DAG.TRI);
+    TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI);
     MachineInstr *NewMI = &*std::prev(InsertPos);
     DAG.LIS->InsertMachineInstrInMaps(*NewMI);
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 89f490356aa26..e6bbbcf5f1afd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2519,8 +2519,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
 
 void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I, Register DestReg,
-                                unsigned SubIdx, const MachineInstr &Orig,
-                                const TargetRegisterInfo &RI) const {
+                                unsigned SubIdx,
+                                const MachineInstr &Orig) const {
 
   // Try shrinking the instruction to remat only the part needed for current
   // context.
@@ -2600,7 +2600,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
     break;
   }
 
-  TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
+  TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
 }
 
 std::pair<MachineInstr*, MachineInstr*>
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 6966e45027627..3294a8bbf1b94 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -321,8 +321,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                      Register DestReg, unsigned SubIdx,
-                     const MachineInstr &Orig,
-                     const TargetRegisterInfo &TRI) const override;
+                     const MachineInstr &Orig) const override;
 
   // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
   // instructions. Returns a pair of generated instructions.
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 32f696c7c9b56..36fc7b19f60e6 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1653,8 +1653,7 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
 void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I,
                                      Register DestReg, unsigned SubIdx,
-                                     const MachineInstr &Orig,
-                                     const TargetRegisterInfo &TRI) const {
+                                     const MachineInstr &Orig) const {
   unsigned Opcode = Orig.getOpcode();
   switch (Opcode) {
   default: {
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index c817e94dd9c76..f897a35783b3f 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -232,8 +232,7 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
 
   void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                      Register DestReg, unsigned SubIdx,
-                     const MachineInstr &Orig,
-                     const TargetRegisterInfo &TRI) const override;
+                     const MachineInstr &Orig) const override;
 
   MachineInstr &
   duplicate(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 86ce7645eb91c..ae966427cf1f8 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -957,8 +957,7 @@ bool X86InstrInfo::isReMaterializableImpl(
 void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I,
                                  Register DestReg, unsigned SubIdx,
-                                 const MachineInstr &Orig,
-                                 const TargetRegisterInfo &TRI) const {
+                                 const MachineInstr &Orig) const {
   bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
   if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
                             MachineBasicBlock::LQR_Dead) {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index ec7db70e51aaf..5545e39d1f6e5 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -342,8 +342,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
   bool isReMaterializableImpl(const MachineInstr &MI) const override;
   void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                      Register DestReg, unsigned SubIdx,
-                     const MachineInstr &Orig,
-                     const TargetRegisterInfo &TRI) const override;
+                     const MachineInstr &Orig) const override;
 
   /// Given an operand within a MachineInstr, insert preceding code to put it
   /// into the right format for a particular kind of LEA instruction. This may

From 1f3f522866ae483836cd39c13695a0ab41b547d5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 16:24:39 -0800
Subject: [PATCH 40/97] CodeGen: Remove TRI arguments from stack load/store
 hooks (#158240)

This is directly available in TargetInstrInfo
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |  6 +--
 .../CodeGen/FixupStatepointCallerSaved.cpp    |  6 +--
 llvm/lib/CodeGen/InlineSpiller.cpp            |  8 +--
 llvm/lib/CodeGen/RegAllocFast.cpp             |  7 ++-
 llvm/lib/CodeGen/RegisterScavenging.cpp       |  4 +-
 llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp  |  5 +-
 llvm/lib/CodeGen/TargetInstrInfo.cpp          |  4 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 24 ++++-----
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |  5 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  8 ++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  6 +--
 llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp  |  9 ++--
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp      | 17 ++++---
 llvm/lib/Target/ARM/ARMBaseInstrInfo.h        |  5 +-
 llvm/lib/Target/ARM/Thumb1InstrInfo.cpp       | 11 +++--
 llvm/lib/Target/ARM/Thumb1InstrInfo.h         |  5 +-
 llvm/lib/Target/ARM/Thumb2InstrInfo.cpp       | 16 +++---
 llvm/lib/Target/ARM/Thumb2InstrInfo.h         |  5 +-
 llvm/lib/Target/AVR/AVRInstrInfo.cpp          | 12 ++---
 llvm/lib/Target/AVR/AVRInstrInfo.h            |  6 +--
 llvm/lib/Target/BPF/BPFInstrInfo.cpp          | 11 +++--
 llvm/lib/Target/BPF/BPFInstrInfo.h            |  5 +-
 .../Target/Hexagon/HexagonFrameLowering.cpp   | 11 ++---
 llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp  | 11 +++--
 llvm/lib/Target/Hexagon/HexagonInstrInfo.h    |  5 +-
 llvm/lib/Target/Lanai/LanaiInstrInfo.cpp      |  6 +--
 llvm/lib/Target/Lanai/LanaiInstrInfo.h        |  6 +--
 .../LoongArch/LoongArchFrameLowering.cpp      |  2 +-
 .../Target/LoongArch/LoongArchInstrInfo.cpp   | 16 +++---
 .../lib/Target/LoongArch/LoongArchInstrInfo.h |  6 +--
 llvm/lib/Target/MSP430/MSP430InstrInfo.cpp    | 13 ++---
 llvm/lib/Target/MSP430/MSP430InstrInfo.h      |  6 +--
 llvm/lib/Target/Mips/Mips16InstrInfo.cpp      | 11 +++--
 llvm/lib/Target/Mips/Mips16InstrInfo.h        |  5 +-
 llvm/lib/Target/Mips/MipsInstrInfo.h          | 15 +++---
 llvm/lib/Target/Mips/MipsSEFrameLowering.cpp  | 49 ++++++++-----------
 llvm/lib/Target/Mips/MipsSEInstrInfo.cpp      | 39 ++++++++-------
 llvm/lib/Target/Mips/MipsSEInstrInfo.h        |  5 +-
 llvm/lib/Target/PowerPC/PPCFrameLowering.cpp  | 11 ++---
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp      | 23 +++++----
 llvm/lib/Target/PowerPC/PPCInstrInfo.h        | 12 ++---
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp  | 28 +++++------
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 10 ++--
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      | 27 +++++-----
 llvm/lib/Target/RISCV/RISCVInstrInfo.h        |  6 +--
 llvm/lib/Target/Sparc/SparcInstrInfo.cpp      | 11 +++--
 llvm/lib/Target/Sparc/SparcInstrInfo.h        |  5 +-
 .../Target/SystemZ/SystemZFrameLowering.cpp   | 16 +++---
 llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp  | 14 +++---
 llvm/lib/Target/SystemZ/SystemZInstrInfo.h    |  6 ++-
 llvm/lib/Target/VE/VEInstrInfo.cpp            | 11 +++--
 llvm/lib/Target/VE/VEInstrInfo.h              |  6 ++-
 llvm/lib/Target/X86/X86FastPreTileConfig.cpp  |  3 +-
 llvm/lib/Target/X86/X86FrameLowering.cpp      |  7 ++-
 llvm/lib/Target/X86/X86InstrInfo.cpp          | 24 ++++-----
 llvm/lib/Target/X86/X86InstrInfo.h            |  6 +--
 llvm/lib/Target/XCore/XCoreFrameLowering.cpp  |  5 +-
 llvm/lib/Target/XCore/XCoreInstrInfo.cpp      |  5 +-
 llvm/lib/Target/XCore/XCoreInstrInfo.h        |  6 ++-
 .../lib/Target/Xtensa/XtensaFrameLowering.cpp |  2 +-
 llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp    | 18 ++++---
 llvm/lib/Target/Xtensa/XtensaInstrInfo.h      |  5 +-
 62 files changed, 314 insertions(+), 344 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 7940cd81d319b..18142c2c0adf3 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1196,8 +1196,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   /// register spill instruction, part of prologue, during the frame lowering.
   virtual void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const {
     llvm_unreachable("Target didn't implement "
                      "TargetInstrInfo::storeRegToStackSlot!");
@@ -1215,8 +1214,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   /// register reload instruction, part of epilogue, during the frame lowering.
   virtual void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-      int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const {
     llvm_unreachable("Target didn't implement "
                      "TargetInstrInfo::loadRegFromStackSlot!");
diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
index 8b74dcebd00ac..c23cac7974d51 100644
--- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
+++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
@@ -420,7 +420,7 @@ class StatepointState {
 
       LLVM_DEBUG(dbgs() << "Insert spill before " << *InsertBefore);
       TII.storeRegToStackSlot(*MI.getParent(), InsertBefore, Reg, IsKill, FI,
-                              RC, &TRI, Register());
+                              RC, Register());
     }
   }
 
@@ -429,7 +429,7 @@ class StatepointState {
     const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
     int FI = RegToSlotIdx[Reg];
     if (It != MBB->end()) {
-      TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register());
+      TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, Register());
       return;
     }
 
@@ -437,7 +437,7 @@ class StatepointState {
     // and then swap them.
     assert(!MBB->empty() && "Empty block");
     --It;
-    TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register());
+    TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, Register());
     MachineInstr *Reload = It->getPrevNode();
     int Dummy = 0;
     (void)Dummy;
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index c3e0964594bd5..68370303a3aef 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -473,7 +473,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
   MachineInstrSpan MIS(MII, MBB);
   // Insert spill without kill flag immediately after def.
   TII.storeRegToStackSlot(*MBB, MII, SrcReg, false, StackSlot,
-                          MRI.getRegClass(SrcReg), &TRI, Register());
+                          MRI.getRegClass(SrcReg), Register());
   LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII);
   for (const MachineInstr &MI : make_range(MIS.begin(), MII))
     getVDefInterval(MI, LIS);
@@ -1119,7 +1119,7 @@ void InlineSpiller::insertReload(Register NewVReg,
 
   MachineInstrSpan MIS(MI, &MBB);
   TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot,
-                           MRI.getRegClass(NewVReg), &TRI, Register());
+                           MRI.getRegClass(NewVReg), Register());
 
   LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI);
 
@@ -1155,7 +1155,7 @@ void InlineSpiller::insertSpill(Register NewVReg, bool isKill,
 
   if (IsRealSpill)
     TII.storeRegToStackSlot(MBB, SpillBefore, NewVReg, isKill, StackSlot,
-                            MRI.getRegClass(NewVReg), &TRI, Register());
+                            MRI.getRegClass(NewVReg), Register());
   else
     // Don't spill undef value.
     // Anything works for undef, in particular keeping the memory
@@ -1729,7 +1729,7 @@ void HoistSpillHelper::hoistAllSpills() {
       MachineBasicBlock::iterator MII = IPA.getLastInsertPointIter(OrigLI, *BB);
       MachineInstrSpan MIS(MII, BB);
       TII.storeRegToStackSlot(*BB, MII, LiveReg, false, Slot,
-                              MRI.getRegClass(LiveReg), &TRI, Register());
+                              MRI.getRegClass(LiveReg), Register());
       LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII);
       for (const MachineInstr &MI : make_range(MIS.begin(), MII))
         getVDefInterval(MI, LIS);
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index ec6ffd4809246..9097728c84e7e 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -594,8 +594,7 @@ void RegAllocFastImpl::spill(MachineBasicBlock::iterator Before,
   LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n');
 
   const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
-  TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, TRI,
-                           VirtReg);
+  TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, VirtReg);
   ++NumStores;
 
   MachineBasicBlock::iterator FirstTerm = MBB->getFirstTerminator();
@@ -652,7 +651,7 @@ void RegAllocFastImpl::reload(MachineBasicBlock::iterator Before,
                     << printReg(PhysReg, TRI) << '\n');
   int FI = getStackSpaceFor(VirtReg);
   const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
-  TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, TRI, VirtReg);
+  TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, VirtReg);
   ++NumLoads;
 }
 
@@ -1123,7 +1122,7 @@ bool RegAllocFastImpl::defineVirtReg(MachineInstr &MI, unsigned OpNum,
           if (MO.isMBB()) {
             MachineBasicBlock *Succ = MO.getMBB();
             TII->storeRegToStackSlot(*Succ, Succ->begin(), PhysReg, Kill, FI,
-                                     &RC, TRI, VirtReg);
+                                     &RC, VirtReg);
             ++NumStores;
             Succ->addLiveIn(PhysReg);
           }
diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp
index 7e26c2ed59949..d8861672a348f 100644
--- a/llvm/lib/CodeGen/RegisterScavenging.cpp
+++ b/llvm/lib/CodeGen/RegisterScavenging.cpp
@@ -276,14 +276,14 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj,
                          ": Cannot scavenge register without an emergency "
                          "spill slot!");
     }
-    TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, TRI, Register());
+    TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, Register());
     MachineBasicBlock::iterator II = std::prev(Before);
 
     unsigned FIOperandNum = getFrameIndexOperandNum(*II);
     TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
 
     // Restore the scavenged register before its use (or first terminator).
-    TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, TRI, Register());
+    TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, Register());
     II = std::prev(UseMI);
 
     FIOperandNum = getFrameIndexOperandNum(*II);
diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 70c3b2cbae9a6..ebf6d1a52448e 100644
--- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -198,7 +198,7 @@ void TargetFrameLowering::spillCalleeSavedRegister(
   } else {
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
     TII->storeRegToStackSlot(SaveBlock, MI, Reg, true, CS.getFrameIdx(), RC,
-                             TRI, Register());
+                             Register());
   }
 }
 
@@ -212,8 +212,7 @@ void TargetFrameLowering::restoreCalleeSavedRegister(
         .addReg(CS.getDstReg(), getKillRegState(true));
   } else {
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    TII->loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI,
-                              Register());
+    TII->loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register());
     assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!");
   }
 }
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 8fac77f22cd8a..76f99848a823c 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -794,11 +794,11 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
       // code.
       BuildMI(*MBB, Pos, MI.getDebugLoc(), get(TargetOpcode::KILL)).add(MO);
     } else {
-      storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, &TRI,
+      storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC,
                           Register());
     }
   } else
-    loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, &TRI, Register());
+    loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, Register());
 
   return &*--Pos;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index f72a6f0140b48..b93e562f4cee5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5664,7 +5664,6 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator MBBI,
                                            Register SrcReg, bool isKill, int FI,
                                            const TargetRegisterClass *RC,
-                                           const TargetRegisterInfo *TRI,
                                            Register VReg,
                                            MachineInstr::MIFlag Flags) const {
   MachineFunction &MF = *MBB.getParent();
@@ -5678,7 +5677,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   bool Offset = true;
   MCRegister PNRReg = MCRegister::NoRegister;
   unsigned StackID = TargetStackID::Default;
-  switch (TRI->getSpillSize(*RC)) {
+  switch (RI.getSpillSize(*RC)) {
   case 1:
     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
       Opc = AArch64::STRBui;
@@ -5841,10 +5840,12 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
       .addMemOperand(MMO);
 }
 
-void AArch64InstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MBBI,
+                                            Register DestReg, int FI,
+                                            const TargetRegisterClass *RC,
+                                            Register VReg,
+                                            MachineInstr::MIFlag Flags) const {
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
@@ -5856,7 +5857,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
   bool Offset = true;
   unsigned StackID = TargetStackID::Default;
   Register PNRReg = MCRegister::NoRegister;
-  switch (TRI->getSpillSize(*RC)) {
+  switch (TRI.getSpillSize(*RC)) {
   case 1:
     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
       Opc = AArch64::LDRBui;
@@ -6492,10 +6493,10 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
              "Mismatched register size in non subreg COPY");
       if (IsSpill)
         storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
-                            getRegClass(SrcReg), &TRI, Register());
+                            getRegClass(SrcReg), Register());
       else
         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
-                             getRegClass(DstReg), &TRI, Register());
+                             getRegClass(DstReg), Register());
       return &*--InsertPt;
     }
 
@@ -6513,8 +6514,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
       assert(SrcMO.getSubReg() == 0 &&
              "Unexpected subreg on physical register");
       storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
-                          FrameIndex, &AArch64::GPR64RegClass, &TRI,
-                          Register());
+                          FrameIndex, &AArch64::GPR64RegClass, Register());
       return &*--InsertPt;
     }
 
@@ -6548,7 +6548,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
         assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
                    TRI.getRegSizeInBits(*FillRC) &&
                "Mismatched regclass size on folded subreg COPY");
-        loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
+        loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
                              Register());
         MachineInstr &LoadMI = *--InsertPt;
         MachineOperand &LoadDst = LoadMI.getOperand(0);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 179574a73aa01..979c9acbd48e1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -353,14 +353,13 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   // This tells target independent code that it is okay to pass instructions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e6bbbcf5f1afd..9c7804067270f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1668,8 +1668,7 @@ unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
 
 void SIInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-    bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
+    bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
     MachineInstr::MIFlag Flags) const {
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
@@ -1681,7 +1680,7 @@ void SIInstrInfo::storeRegToStackSlot(
   MachineMemOperand *MMO = MF->getMachineMemOperand(
       PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
       FrameInfo.getObjectAlign(FrameIndex));
-  unsigned SpillSize = TRI->getSpillSize(*RC);
+  unsigned SpillSize = RI.getSpillSize(*RC);
 
   MachineRegisterInfo &MRI = MF->getRegInfo();
   if (RI.isSGPRClass(RC)) {
@@ -1863,14 +1862,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MI,
                                        Register DestReg, int FrameIndex,
                                        const TargetRegisterClass *RC,
-                                       const TargetRegisterInfo *TRI,
                                        Register VReg,
                                        MachineInstr::MIFlag Flags) const {
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
   const DebugLoc &DL = MBB.findDebugLoc(MI);
-  unsigned SpillSize = TRI->getSpillSize(*RC);
+  unsigned SpillSize = RI.getSpillSize(*RC);
 
   MachinePointerInfo PtrInfo
     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 3294a8bbf1b94..c048b85b1e99a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -307,14 +307,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-      int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 40eeeb8a8630d..cbd08f0fb5dff 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -117,27 +117,26 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
   MachineFunction &MF = *SaveBlock.getParent();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *RI = ST.getRegisterInfo();
 
   MachineBasicBlock::iterator I = SaveBlock.begin();
-  if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
+  if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, RI)) {
     for (const CalleeSavedInfo &CS : CSI) {
       // Insert the spill to the stack frame.
       MCRegister Reg = CS.getReg();
 
       MachineInstrSpan MIS(I, &SaveBlock);
-      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(
+      const TargetRegisterClass *RC = RI->getMinimalPhysRegClass(
           Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32);
 
       // If this value was already livein, we probably have a direct use of the
       // incoming register value, so don't kill at the spill point. This happens
       // since we pass some special inputs (workgroup IDs) in the callee saved
       // range.
-      const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, TRI);
+      const bool IsLiveIn = isLiveIntoMBB(Reg, SaveBlock, RI);
       TII.storeRegToStackSlot(SaveBlock, I, Reg, !IsLiveIn, CS.getFrameIdx(),
-                              RC, TRI, Register());
+                              RC, Register());
 
       if (Indexes) {
         assert(std::distance(MIS.begin(), I) == 1);
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 36fc7b19f60e6..6077c18463240 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -945,18 +945,18 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator I,
                                            Register SrcReg, bool isKill, int FI,
                                            const TargetRegisterClass *RC,
-                                           const TargetRegisterInfo *TRI,
                                            Register VReg,
                                            MachineInstr::MIFlag Flags) const {
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   Align Alignment = MFI.getObjectAlign(FI);
+  const ARMBaseRegisterInfo &TRI = getRegisterInfo();
 
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
       MFI.getObjectSize(FI), Alignment);
 
-  switch (TRI->getSpillSize(*RC)) {
+  switch (TRI.getSpillSize(*RC)) {
     case 2:
       if (ARM::HPRRegClass.hasSubClassEq(RC)) {
         BuildMI(MBB, I, DebugLoc(), get(ARM::VSTRH))
@@ -1208,10 +1208,12 @@ Register ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
   return false;
 }
 
-void ARMBaseInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void ARMBaseInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator I,
+                                            Register DestReg, int FI,
+                                            const TargetRegisterClass *RC,
+                                            Register VReg,
+                                            MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
   MachineFunction &MF = *MBB.getParent();
@@ -1221,7 +1223,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(
       MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
       MFI.getObjectSize(FI), Alignment);
 
-  switch (TRI->getSpillSize(*RC)) {
+  const ARMBaseRegisterInfo &TRI = getRegisterInfo();
+  switch (TRI.getSpillSize(*RC)) {
   case 2:
     if (ARM::HPRRegClass.hasSubClassEq(RC)) {
       BuildMI(MBB, I, DL, get(ARM::VLDRH), DestReg)
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index f897a35783b3f..04e2ab055cf1a 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -216,14 +216,13 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
index f95ba6a4d86fb..01f588f0cdc38 100644
--- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -116,7 +116,6 @@ void Thumb1InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator I,
                                           Register SrcReg, bool isKill, int FI,
                                           const TargetRegisterClass *RC,
-                                          const TargetRegisterInfo *TRI,
                                           Register VReg,
                                           MachineInstr::MIFlag Flags) const {
   assert((RC == &ARM::tGPRRegClass ||
@@ -142,10 +141,12 @@ void Thumb1InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   }
 }
 
-void Thumb1InstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void Thumb1InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator I,
+                                           Register DestReg, int FI,
+                                           const TargetRegisterClass *RC,
+                                           Register VReg,
+                                           MachineInstr::MIFlag Flags) const {
   assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) ||
           (DestReg.isPhysical() && isARMLowRegister(DestReg))) &&
          "Unknown regclass!");
diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/llvm/lib/Target/ARM/Thumb1InstrInfo.h
index 16350a65a6198..289a30a4ca1e4 100644
--- a/llvm/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -43,14 +43,13 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo {
                    bool RenamableSrc = false) const override;
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   bool canCopyGluedNodeDuringSchedule(SDNode *N) const override;
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 032acf559ffc3..efb92c9bcac18 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -165,7 +165,6 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator I,
                                           Register SrcReg, bool isKill, int FI,
                                           const TargetRegisterClass *RC,
-                                          const TargetRegisterInfo *TRI,
                                           Register VReg,
                                           MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
@@ -203,14 +202,16 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     return;
   }
 
-  ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI,
+  ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC,
                                         Register());
 }
 
-void Thumb2InstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void Thumb2InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator I,
+                                           Register DestReg, int FI,
+                                           const TargetRegisterClass *RC,
+                                           Register VReg,
+                                           MachineInstr::MIFlag Flags) const {
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   MachineMemOperand *MMO = MF.getMachineMemOperand(
@@ -247,8 +248,7 @@ void Thumb2InstrInfo::loadRegFromStackSlot(
     return;
   }
 
-  ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI,
-                                         Register());
+  ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, Register());
 }
 
 void Thumb2InstrInfo::expandLoadStackGuard(
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index 59ef39d442aef..1e11cb37efc05 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -44,14 +44,13 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
index 5e247cb64a991..6c37ba1411dde 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -126,8 +126,7 @@ Register AVRInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
 
 void AVRInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-    bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
+    bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
     MachineInstr::MIFlag Flags) const {
   MachineFunction &MF = *MBB.getParent();
   AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
@@ -142,9 +141,9 @@ void AVRInstrInfo::storeRegToStackSlot(
       MFI.getObjectAlign(FrameIndex));
 
   unsigned Opcode = 0;
-  if (TRI->isTypeLegalForClass(*RC, MVT::i8)) {
+  if (RI.isTypeLegalForClass(*RC, MVT::i8)) {
     Opcode = AVR::STDPtrQRr;
-  } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) {
+  } else if (RI.isTypeLegalForClass(*RC, MVT::i16)) {
     Opcode = AVR::STDWPtrQRr;
   } else {
     llvm_unreachable("Cannot store this register into a stack slot!");
@@ -161,7 +160,6 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MI,
                                         Register DestReg, int FrameIndex,
                                         const TargetRegisterClass *RC,
-                                        const TargetRegisterInfo *TRI,
                                         Register VReg,
                                         MachineInstr::MIFlag Flags) const {
   MachineFunction &MF = *MBB.getParent();
@@ -173,9 +171,9 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
       MFI.getObjectAlign(FrameIndex));
 
   unsigned Opcode = 0;
-  if (TRI->isTypeLegalForClass(*RC, MVT::i8)) {
+  if (TRI.isTypeLegalForClass(*RC, MVT::i8)) {
     Opcode = AVR::LDDRdPtrQ;
-  } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) {
+  } else if (TRI.isTypeLegalForClass(*RC, MVT::i16)) {
     // Opcode = AVR::LDDWRdPtrQ;
     //: FIXME: remove this once PR13375 gets fixed
     Opcode = AVR::LDDWRdYQ;
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.h b/llvm/lib/Target/AVR/AVRInstrInfo.h
index 759aea2010962..4db535a990451 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.h
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.h
@@ -79,13 +79,11 @@ class AVRInstrInfo : public AVRGenInstrInfo {
                    bool RenamableSrc = false) const override;
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-      int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   Register isLoadFromStackSlot(const MachineInstr &MI,
                                int &FrameIndex) const override;
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
index 0e56e658952a2..095e2497eec17 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -127,7 +127,6 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I,
                                        Register SrcReg, bool IsKill, int FI,
                                        const TargetRegisterClass *RC,
-                                       const TargetRegisterInfo *TRI,
                                        Register VReg,
                                        MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
@@ -148,10 +147,12 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     llvm_unreachable("Can't store this register to stack slot");
 }
 
-void BPFInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        Register DestReg, int FI,
+                                        const TargetRegisterClass *RC,
+                                        Register VReg,
+                                        MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (I != MBB.end())
     DL = I->getDebugLoc();
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h
index 911e880166d29..d3ef9bc164f4a 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.h
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.h
@@ -39,14 +39,13 @@ class BPFInstrInfo : public BPFGenInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index a90255d1f62e6..df612262def5e 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -1405,7 +1405,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB,
     bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg);
     int FI = I.getFrameIdx();
     const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg);
-    HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI, Register());
+    HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, Register());
     if (IsKill)
       MBB.addLiveIn(Reg);
   }
@@ -1470,7 +1470,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB,
     MCRegister Reg = I.getReg();
     const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg);
     int FI = I.getFrameIdx();
-    HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI, Register());
+    HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, Register());
   }
 
   return true;
@@ -1814,8 +1814,7 @@ bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B,
     .addReg(SrcR, getKillRegState(IsKill))
     .addReg(TmpR0, RegState::Kill);
 
-  auto *HRI = B.getParent()->getSubtarget<HexagonSubtarget>().getRegisterInfo();
-  HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, HRI, Register());
+  HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, Register());
   expandStoreVec(B, std::prev(It), MRI, HII, NewRegs);
 
   NewRegs.push_back(TmpR0);
@@ -1844,9 +1843,7 @@ bool HexagonFrameLowering::expandLoadVecPred(MachineBasicBlock &B,
 
   BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0)
     .addImm(0x01010101);
-  MachineFunction &MF = *B.getParent();
-  auto *HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
-  HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, HRI, Register());
+  HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, Register());
   expandLoadVec(B, std::prev(It), MRI, HII, NewRegs);
 
   BuildMI(B, It, DL, HII.get(Hexagon::V6_vandvrt), DstR)
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index dd9f2fa5cf342..7682af4543b7c 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -964,7 +964,6 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator I,
                                            Register SrcReg, bool isKill, int FI,
                                            const TargetRegisterClass *RC,
-                                           const TargetRegisterInfo *TRI,
                                            Register VReg,
                                            MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBB.findDebugLoc(I);
@@ -1009,10 +1008,12 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   }
 }
 
-void HexagonInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator I,
+                                            Register DestReg, int FI,
+                                            const TargetRegisterClass *RC,
+                                            Register VReg,
+                                            MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBB.findDebugLoc(I);
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo &MFI = MF.getFrameInfo();
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 7a0c77cc3f705..796b978a2c3f0 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -188,8 +188,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo {
   /// is true, the register operand is the last use and must be marked kill.
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   /// Load the specified register of the given register class from the specified
@@ -198,7 +197,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo {
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   /// This function is called for all pseudo instructions
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index b3d28565d4f06..14b7557e7f94a 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -49,8 +49,7 @@ void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 void LanaiInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
     Register SourceRegister, bool IsKill, int FrameIndex,
-    const TargetRegisterClass *RegisterClass,
-    const TargetRegisterInfo * /*RegisterInfo*/, Register /*VReg*/,
+    const TargetRegisterClass *RegisterClass, Register /*VReg*/,
     MachineInstr::MIFlag /*Flags*/) const {
   DebugLoc DL;
   if (Position != MBB.end()) {
@@ -70,8 +69,7 @@ void LanaiInstrInfo::storeRegToStackSlot(
 void LanaiInstrInfo::loadRegFromStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
     Register DestinationRegister, int FrameIndex,
-    const TargetRegisterClass *RegisterClass,
-    const TargetRegisterInfo * /*RegisterInfo*/, Register /*VReg*/,
+    const TargetRegisterClass *RegisterClass, Register /*VReg*/,
     MachineInstr::MIFlag /*Flags*/) const {
   DebugLoc DL;
   if (Position != MBB.end()) {
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
index d98276243dc31..155e2f03be630 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
@@ -58,15 +58,13 @@ class LanaiInstrInfo : public LanaiGenInstrInfo {
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
       Register SourceRegister, bool IsKill, int FrameIndex,
-      const TargetRegisterClass *RegisterClass,
-      const TargetRegisterInfo *RegisterInfo, Register VReg,
+      const TargetRegisterClass *RegisterClass, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
       Register DestinationRegister, int FrameIndex,
-      const TargetRegisterClass *RegisterClass,
-      const TargetRegisterInfo *RegisterInfo, Register VReg,
+      const TargetRegisterClass *RegisterClass, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
index 1493bf4cba695..690b0639484d0 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -449,7 +449,7 @@ bool LoongArchFrameLowering::spillCalleeSavedRegisters(
     bool IsKill =
         !(Reg == LoongArch::R1 && MF->getFrameInfo().isReturnAddressTaken());
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, CS.getFrameIdx(), RC, TRI,
+    TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, CS.getFrameIdx(), RC,
                             Register());
   }
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 5eb3bd6d01a64..9fc862af7ea24 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -113,14 +113,14 @@ void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 void LoongArchInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg,
     bool IsKill, int FI, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
-    MachineInstr::MIFlag Flags) const {
+
+    Register VReg, MachineInstr::MIFlag Flags) const {
   MachineFunction *MF = MBB.getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
 
   unsigned Opcode;
   if (LoongArch::GPRRegClass.hasSubClassEq(RC))
-    Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32
+    Opcode = TRI.getRegSizeInBits(LoongArch::GPRRegClass) == 32
                  ? LoongArch::ST_W
                  : LoongArch::ST_D;
   else if (LoongArch::FPR32RegClass.hasSubClassEq(RC))
@@ -149,8 +149,8 @@ void LoongArchInstrInfo::storeRegToStackSlot(
 
 void LoongArchInstrInfo::loadRegFromStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+    int FI, const TargetRegisterClass *RC, Register VReg,
+    MachineInstr::MIFlag Flags) const {
   MachineFunction *MF = MBB.getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
   DebugLoc DL;
@@ -159,7 +159,7 @@ void LoongArchInstrInfo::loadRegFromStackSlot(
 
   unsigned Opcode;
   if (LoongArch::GPRRegClass.hasSubClassEq(RC))
-    Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32
+    Opcode = RegInfo.getRegSizeInBits(LoongArch::GPRRegClass) == 32
                  ? LoongArch::LD_W
                  : LoongArch::LD_D;
   else if (LoongArch::FPR32RegClass.hasSubClassEq(RC))
@@ -665,13 +665,13 @@ void LoongArchInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
     if (FrameIndex == -1)
       report_fatal_error("The function size is incorrectly estimated.");
     storeRegToStackSlot(MBB, PCALAU12I, Scav, /*IsKill=*/true, FrameIndex,
-                        &LoongArch::GPRRegClass, TRI, Register());
+                        &LoongArch::GPRRegClass, Register());
     TRI->eliminateFrameIndex(std::prev(PCALAU12I.getIterator()),
                              /*SpAdj=*/0, /*FIOperandNum=*/1);
     PCALAU12I.getOperand(1).setMBB(&RestoreBB);
     ADDI.getOperand(2).setMBB(&RestoreBB);
     loadRegFromStackSlot(RestoreBB, RestoreBB.end(), Scav, FrameIndex,
-                         &LoongArch::GPRRegClass, TRI, Register());
+                         &LoongArch::GPRRegClass, Register());
     TRI->eliminateFrameIndex(RestoreBB.back(),
                              /*SpAdj=*/0, /*FIOperandNum=*/1);
   }
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index d43d229256c13..9f7a0a2239a87 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -40,13 +40,11 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool IsKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DstReg,
-      int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   // Materializes the given integer Val into DstReg.
diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
index af053b8645b9b..0fb4e9d9fcb62 100644
--- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -32,8 +32,7 @@ MSP430InstrInfo::MSP430InstrInfo(const MSP430Subtarget &STI)
 
 void MSP430InstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-    bool isKill, int FrameIdx, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
+    bool isKill, int FrameIdx, const TargetRegisterClass *RC, Register VReg,
     MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (MI != MBB.end()) DL = MI->getDebugLoc();
@@ -57,10 +56,12 @@ void MSP430InstrInfo::storeRegToStackSlot(
     llvm_unreachable("Cannot store this register to stack slot!");
 }
 
-void MSP430InstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-    int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                           Register DestReg, int FrameIdx,
+                                           const TargetRegisterClass *RC,
+                                           Register VReg,
+                                           MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (MI != MBB.end()) DL = MI->getDebugLoc();
   MachineFunction &MF = *MBB.getParent();
diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/llvm/lib/Target/MSP430/MSP430InstrInfo.h
index 316c136890bf8..c0a398452ef6d 100644
--- a/llvm/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.h
@@ -42,13 +42,11 @@ class MSP430InstrInfo : public MSP430GenInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-      int FrameIdx, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      int FrameIdx, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
index 69b96cf3fc933..d23ec57d46e17 100644
--- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -101,7 +101,6 @@ void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator I,
                                       Register SrcReg, bool isKill, int FI,
                                       const TargetRegisterClass *RC,
-                                      const TargetRegisterInfo *TRI,
                                       int64_t Offset,
                                       MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
@@ -116,10 +115,12 @@ void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB,
       .addMemOperand(MMO);
 }
 
-void Mips16InstrInfo::loadRegFromStack(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    int64_t Offset, MachineInstr::MIFlag Flags) const {
+void Mips16InstrInfo::loadRegFromStack(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       Register DestReg, int FI,
+                                       const TargetRegisterClass *RC,
+                                       int64_t Offset,
+                                       MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
   MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.h b/llvm/lib/Target/Mips/Mips16InstrInfo.h
index 2834fd3d7eec2..4300d086f0614 100644
--- a/llvm/lib/Target/Mips/Mips16InstrInfo.h
+++ b/llvm/lib/Target/Mips/Mips16InstrInfo.h
@@ -56,13 +56,14 @@ class Mips16InstrInfo : public MipsInstrInfo {
   void storeRegToStack(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
       bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, int64_t Offset,
+      int64_t Offset,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStack(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, int64_t Offset,
+
+      int64_t Offset,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h
index fc9424808756a..0b90972977d5e 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.h
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.h
@@ -147,31 +147,28 @@ class MipsInstrInfo : public MipsGenInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override {
-    storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, TRI, 0, Flags);
+    storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, 0, Flags);
   }
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override {
-    loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0, Flags);
+    loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, 0, Flags);
   }
 
   virtual void
   storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                   Register SrcReg, bool isKill, int FrameIndex,
-                  const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-                  int64_t Offset,
+                  const TargetRegisterClass *RC, int64_t Offset,
                   MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const = 0;
 
   virtual void loadRegFromStack(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-      int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, int64_t Offset,
+      int FrameIndex, const TargetRegisterClass *RC, int64_t Offset,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const = 0;
 
   virtual void adjustStackPtr(unsigned SP, int64_t Amount,
diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
index f08704a7e799c..942194cf31d44 100644
--- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -172,7 +172,7 @@ void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) {
   Register VR = MRI.createVirtualRegister(RC);
   Register Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
 
-  TII.loadRegFromStack(MBB, I, VR, FI, RC, &RegInfo, 0);
+  TII.loadRegFromStack(MBB, I, VR, FI, RC, 0);
   BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), Dst)
     .addReg(VR, RegState::Kill);
 }
@@ -189,7 +189,7 @@ void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) {
 
   BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), VR)
     .addReg(Src, getKillRegState(I->getOperand(0).isKill()));
-  TII.storeRegToStack(MBB, I, VR, true, FI, RC, &RegInfo, 0);
+  TII.storeRegToStack(MBB, I, VR, true, FI, RC, 0);
 }
 
 void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
@@ -210,9 +210,9 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
   DebugLoc DL = I->getDebugLoc();
   const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
 
-  TII.loadRegFromStack(MBB, I, VR0, FI, RC, &RegInfo, 0);
+  TII.loadRegFromStack(MBB, I, VR0, FI, RC, 0);
   BuildMI(MBB, I, DL, Desc, Lo).addReg(VR0, RegState::Kill);
-  TII.loadRegFromStack(MBB, I, VR1, FI, RC, &RegInfo, RegSize);
+  TII.loadRegFromStack(MBB, I, VR1, FI, RC, RegSize);
   BuildMI(MBB, I, DL, Desc, Hi).addReg(VR1, RegState::Kill);
 }
 
@@ -234,9 +234,9 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
   DebugLoc DL = I->getDebugLoc();
 
   BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
-  TII.storeRegToStack(MBB, I, VR0, true, FI, RC, &RegInfo, 0);
+  TII.storeRegToStack(MBB, I, VR0, true, FI, RC, 0);
   BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill);
-  TII.storeRegToStack(MBB, I, VR1, true, FI, RC, &RegInfo, RegSize);
+  TII.storeRegToStack(MBB, I, VR1, true, FI, RC, RegSize);
 }
 
 bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) {
@@ -321,11 +321,9 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB,
     int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(MF, RC2);
     if (!Subtarget.isLittle())
       std::swap(LoReg, HiReg);
-    TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC,
-                        &RegInfo, 0);
-    TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC,
-                        &RegInfo, 4);
-    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, 0);
+    TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC, 0);
+    TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC, 4);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, 0);
     return true;
   }
 
@@ -385,8 +383,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
     // We re-use the same spill slot each time so that the stack frame doesn't
     // grow too much in functions with a large number of moves.
     int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(MF, RC);
-    TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, &RegInfo, 0);
-    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, Offset);
+    TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, 0);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, Offset);
     return true;
   }
 
@@ -480,8 +478,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
       if (!MBB.isLiveIn(ABI.GetEhDataReg(I)))
         MBB.addLiveIn(ABI.GetEhDataReg(I));
       TII.storeRegToStackSlot(MBB, MBBI, ABI.GetEhDataReg(I), false,
-                              MipsFI->getEhDataRegFI(I), RC, &RegInfo,
-                              Register());
+                              MipsFI->getEhDataRegFI(I), RC, Register());
     }
 
     // Emit .cfi_offset directives for eh data registers.
@@ -579,8 +576,7 @@ void MipsSEFrameLowering::emitInterruptPrologueStub(
       .setMIFlag(MachineInstr::FrameSetup);
 
   STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
-                                      MipsFI->getISRRegFI(0), PtrRC,
-                                      STI.getRegisterInfo(), 0);
+                                      MipsFI->getISRRegFI(0), PtrRC, 0);
 
   // Fetch and Spill Status
   MBB.addLiveIn(Mips::COP012);
@@ -590,8 +586,7 @@ void MipsSEFrameLowering::emitInterruptPrologueStub(
       .setMIFlag(MachineInstr::FrameSetup);
 
   STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
-                                      MipsFI->getISRRegFI(1), PtrRC,
-                                      STI.getRegisterInfo(), 0);
+                                      MipsFI->getISRRegFI(1), PtrRC, 0);
 
   // Build the configuration for disabling lower priority interrupts. Non EIC
   // interrupts need to be masked off with zero, EIC from the Cause register.
@@ -657,7 +652,6 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
 
   const MipsSEInstrInfo &TII =
       *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo());
-  const MipsRegisterInfo &RegInfo = *STI.getRegisterInfo();
 
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   MipsABIInfo ABI = STI.getABI();
@@ -690,8 +684,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
     // Insert instructions that restore eh data registers.
     for (int J = 0; J < 4; ++J) {
       TII.loadRegFromStackSlot(MBB, I, ABI.GetEhDataReg(J),
-                               MipsFI->getEhDataRegFI(J), RC, &RegInfo,
-                               Register());
+                               MipsFI->getEhDataRegFI(J), RC, Register());
     }
   }
 
@@ -722,17 +715,15 @@ void MipsSEFrameLowering::emitInterruptEpilogueStub(
   BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EHB));
 
   // Restore EPC
-  STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1,
-                                           MipsFI->getISRRegFI(0), PtrRC,
-                                           STI.getRegisterInfo(), Register());
+  STI.getInstrInfo()->loadRegFromStackSlot(
+      MBB, MBBI, Mips::K1, MipsFI->getISRRegFI(0), PtrRC, Register());
   BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP014)
       .addReg(Mips::K1)
       .addImm(0);
 
   // Restore Status
-  STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1,
-                                           MipsFI->getISRRegFI(1), PtrRC,
-                                           STI.getRegisterInfo(), Register());
+  STI.getInstrInfo()->loadRegFromStackSlot(
+      MBB, MBBI, Mips::K1, MipsFI->getISRRegFI(1), PtrRC, Register());
   BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012)
       .addReg(Mips::K1)
       .addImm(0);
@@ -795,7 +786,7 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters(
     // Insert the spill to the stack frame.
     bool IsKill = !IsRAAndRetAddrIsTaken;
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, I.getFrameIdx(), RC, TRI,
+    TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, I.getFrameIdx(), RC,
                             Register());
   }
 
diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
index b3aa5ac1ac1b5..a1d0aa089c089 100644
--- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -209,7 +209,6 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator I,
                                       Register SrcReg, bool isKill, int FI,
                                       const TargetRegisterClass *RC,
-                                      const TargetRegisterInfo *TRI,
                                       int64_t Offset,
                                       MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
@@ -235,16 +234,16 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB,
     Opc = Mips::SDC1;
   else if (Mips::FGR64RegClass.hasSubClassEq(RC))
     Opc = Mips::SDC164;
-  else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8))
+  else if (RI.isTypeLegalForClass(*RC, MVT::v16i8))
     Opc = Mips::ST_B;
-  else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) ||
-           TRI->isTypeLegalForClass(*RC, MVT::v8f16))
+  else if (RI.isTypeLegalForClass(*RC, MVT::v8i16) ||
+           RI.isTypeLegalForClass(*RC, MVT::v8f16))
     Opc = Mips::ST_H;
-  else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) ||
-           TRI->isTypeLegalForClass(*RC, MVT::v4f32))
+  else if (RI.isTypeLegalForClass(*RC, MVT::v4i32) ||
+           RI.isTypeLegalForClass(*RC, MVT::v4f32))
     Opc = Mips::ST_W;
-  else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) ||
-           TRI->isTypeLegalForClass(*RC, MVT::v2f64))
+  else if (RI.isTypeLegalForClass(*RC, MVT::v2i64) ||
+           RI.isTypeLegalForClass(*RC, MVT::v2f64))
     Opc = Mips::ST_D;
   else if (Mips::LO32RegClass.hasSubClassEq(RC))
     Opc = Mips::SW;
@@ -281,10 +280,12 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB,
     .addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
 }
 
-void MipsSEInstrInfo::loadRegFromStack(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    int64_t Offset, MachineInstr::MIFlag Flags) const {
+void MipsSEInstrInfo::loadRegFromStack(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       Register DestReg, int FI,
+                                       const TargetRegisterClass *RC,
+                                       int64_t Offset,
+                                       MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
   MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
@@ -313,16 +314,16 @@ void MipsSEInstrInfo::loadRegFromStack(
     Opc = Mips::LDC1;
   else if (Mips::FGR64RegClass.hasSubClassEq(RC))
     Opc = Mips::LDC164;
-  else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8))
+  else if (RI.isTypeLegalForClass(*RC, MVT::v16i8))
     Opc = Mips::LD_B;
-  else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) ||
-           TRI->isTypeLegalForClass(*RC, MVT::v8f16))
+  else if (RI.isTypeLegalForClass(*RC, MVT::v8i16) ||
+           RI.isTypeLegalForClass(*RC, MVT::v8f16))
     Opc = Mips::LD_H;
-  else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) ||
-           TRI->isTypeLegalForClass(*RC, MVT::v4f32))
+  else if (RI.isTypeLegalForClass(*RC, MVT::v4i32) ||
+           RI.isTypeLegalForClass(*RC, MVT::v4f32))
     Opc = Mips::LD_W;
-  else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) ||
-           TRI->isTypeLegalForClass(*RC, MVT::v2f64))
+  else if (RI.isTypeLegalForClass(*RC, MVT::v2i64) ||
+           RI.isTypeLegalForClass(*RC, MVT::v2f64))
     Opc = Mips::LD_D;
   else if (Mips::HI32RegClass.hasSubClassEq(RC))
     Opc = Mips::LW;
diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/llvm/lib/Target/Mips/MipsSEInstrInfo.h
index 0a7a0e5ac2a56..5c48ccdc27f02 100644
--- a/llvm/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.h
@@ -50,13 +50,12 @@ class MipsSEInstrInfo : public MipsInstrInfo {
   void storeRegToStack(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
       bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, int64_t Offset,
+      int64_t Offset,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStack(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-      int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, int64_t Offset,
+      int FrameIndex, const TargetRegisterClass *RC, int64_t Offset,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 910bc9d281259..aae3e49f6c70b 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -2520,11 +2520,11 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
         // saved vector registers.
         if (Subtarget.needsSwapsForVSXMemOps() &&
             !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
-          TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn,
-                                       I.getFrameIdx(), RC, TRI);
+          TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(),
+                                       RC);
         else
           TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC,
-                                  TRI, Register());
+                                  Register());
       }
     }
   }
@@ -2690,10 +2690,9 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
         // saved vector registers.
         if (Subtarget.needsSwapsForVSXMemOps() &&
             !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
-          TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC,
-                                        TRI);
+          TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC);
         else
-          TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI,
+          TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC,
                                    Register());
 
         assert(I != MBB.begin() &&
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 8d9d4c7f8e128..366a7b6d0135a 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2014,8 +2014,7 @@ void PPCInstrInfo::StoreRegToStackSlot(
 
 void PPCInstrInfo::storeRegToStackSlotNoUpd(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg,
-    bool isKill, int FrameIdx, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI) const {
+    bool isKill, int FrameIdx, const TargetRegisterClass *RC) const {
   MachineFunction &MF = *MBB.getParent();
   SmallVector<MachineInstr *, 4> NewMIs;
 
@@ -2034,8 +2033,7 @@ void PPCInstrInfo::storeRegToStackSlotNoUpd(
 
 void PPCInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-    bool isKill, int FrameIdx, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
+    bool isKill, int FrameIdx, const TargetRegisterClass *RC, Register VReg,
     MachineInstr::MIFlag Flags) const {
   // We need to avoid a situation in which the value from a VRRC register is
   // spilled using an Altivec instruction and reloaded into a VSRC register
@@ -2045,7 +2043,7 @@ void PPCInstrInfo::storeRegToStackSlot(
   // the register is defined using an Altivec instruction and is then used by a
   // VSX instruction.
   RC = updatedRC(RC);
-  storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC, TRI);
+  storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC);
 }
 
 void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
@@ -2060,8 +2058,7 @@ void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
 
 void PPCInstrInfo::loadRegFromStackSlotNoUpd(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg,
-    int FrameIdx, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI) const {
+    int FrameIdx, const TargetRegisterClass *RC) const {
   MachineFunction &MF = *MBB.getParent();
   SmallVector<MachineInstr*, 4> NewMIs;
   DebugLoc DL;
@@ -2080,10 +2077,12 @@ void PPCInstrInfo::loadRegFromStackSlotNoUpd(
   NewMIs.back()->addMemOperand(MF, MMO);
 }
 
-void PPCInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-    int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        Register DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        Register VReg,
+                                        MachineInstr::MIFlag Flags) const {
   // We need to avoid a situation in which the value from a VRRC register is
   // spilled using an Altivec instruction and reloaded into a VSRC register
   // using a VSX instruction. The issue with this is that the VSX
@@ -2093,7 +2092,7 @@ void PPCInstrInfo::loadRegFromStackSlot(
   // VSX instruction.
   RC = updatedRC(RC);
 
-  loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC, TRI);
+  loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC);
 }
 
 bool PPCInstrInfo::
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index d67fc28935586..8b824bc219ab2 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -570,7 +570,8 @@ class PPCInstrInfo : public PPCGenInstrInfo {
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
       bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   // Emits a register spill without updating the register class for vector
@@ -579,13 +580,13 @@ class PPCInstrInfo : public PPCGenInstrInfo {
   void storeRegToStackSlotNoUpd(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator MBBI,
                                 unsigned SrcReg, bool isKill, int FrameIndex,
-                                const TargetRegisterClass *RC,
-                                const TargetRegisterInfo *TRI) const;
+                                const TargetRegisterClass *RC) const;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   // Emits a register reload without updating the register class for vector
@@ -594,8 +595,7 @@ class PPCInstrInfo : public PPCGenInstrInfo {
   void loadRegFromStackSlotNoUpd(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  unsigned DestReg, int FrameIndex,
-                                 const TargetRegisterClass *RC,
-                                 const TargetRegisterInfo *TRI) const;
+                                 const TargetRegisterClass *RC) const;
 
   unsigned getStoreOpcodeForSpill(const TargetRegisterClass *RC) const;
 
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index f881c4c79d444..f7fc9528920a6 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -291,12 +291,12 @@ static void emitSiFiveCLICPreemptibleSaves(MachineFunction &MF,
   // which affects other passes.
   TII->storeRegToStackSlot(MBB, MBBI, RISCV::X8, /* IsKill=*/true,
                            RVFI->getInterruptCSRFrameIndex(0),
-                           &RISCV::GPRRegClass, STI.getRegisterInfo(),
-                           Register(), MachineInstr::FrameSetup);
+                           &RISCV::GPRRegClass, Register(),
+                           MachineInstr::FrameSetup);
   TII->storeRegToStackSlot(MBB, MBBI, RISCV::X9, /* IsKill=*/true,
                            RVFI->getInterruptCSRFrameIndex(1),
-                           &RISCV::GPRRegClass, STI.getRegisterInfo(),
-                           Register(), MachineInstr::FrameSetup);
+                           &RISCV::GPRRegClass, Register(),
+                           MachineInstr::FrameSetup);
 
   // Put `mcause` into X8 (s0), and `mepc` into X9 (s1). If either of these are
   // used in the function, then they will appear in `getUnmanagedCSI` and will
@@ -357,14 +357,12 @@ static void emitSiFiveCLICPreemptibleRestores(MachineFunction &MF,
 
   // X8 and X9 need to be restored to their values on function entry, which we
   // saved onto the stack in `emitSiFiveCLICPreemptibleSaves`.
-  TII->loadRegFromStackSlot(MBB, MBBI, RISCV::X9,
-                            RVFI->getInterruptCSRFrameIndex(1),
-                            &RISCV::GPRRegClass, STI.getRegisterInfo(),
-                            Register(), MachineInstr::FrameSetup);
-  TII->loadRegFromStackSlot(MBB, MBBI, RISCV::X8,
-                            RVFI->getInterruptCSRFrameIndex(0),
-                            &RISCV::GPRRegClass, STI.getRegisterInfo(),
-                            Register(), MachineInstr::FrameSetup);
+  TII->loadRegFromStackSlot(
+      MBB, MBBI, RISCV::X9, RVFI->getInterruptCSRFrameIndex(1),
+      &RISCV::GPRRegClass, Register(), MachineInstr::FrameSetup);
+  TII->loadRegFromStackSlot(
+      MBB, MBBI, RISCV::X8, RVFI->getInterruptCSRFrameIndex(0),
+      &RISCV::GPRRegClass, Register(), MachineInstr::FrameSetup);
 }
 
 // Get the ID of the libcall used for spilling and restoring callee saved
@@ -2177,7 +2175,7 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters(
       MCRegister Reg = CS.getReg();
       const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
       TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg),
-                              CS.getFrameIdx(), RC, TRI, Register(),
+                              CS.getFrameIdx(), RC, Register(),
                               MachineInstr::FrameSetup);
     }
   };
@@ -2267,8 +2265,8 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters(
     for (auto &CS : CSInfo) {
       MCRegister Reg = CS.getReg();
       const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-      TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI,
-                               Register(), MachineInstr::FrameDestroy);
+      TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register(),
+                               MachineInstr::FrameDestroy);
       assert(MI != MBB.begin() &&
              "loadRegFromStackSlot didn't insert any code!");
     }
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a3ccbd8d4a8aa..4d86a365a5939 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -22203,8 +22203,7 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
 
   MachineFunction &MF = *BB->getParent();
   DebugLoc DL = MI.getDebugLoc();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+  const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo();
   Register LoReg = MI.getOperand(0).getReg();
   Register HiReg = MI.getOperand(1).getReg();
   Register SrcReg = MI.getOperand(2).getReg();
@@ -22213,7 +22212,7 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
   int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF);
 
   TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC,
-                          RI, Register());
+                          Register());
   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
   MachineMemOperand *MMOLo =
       MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 4, Align(8));
@@ -22239,8 +22238,7 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
 
   MachineFunction &MF = *BB->getParent();
   DebugLoc DL = MI.getDebugLoc();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+  const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo();
   Register DstReg = MI.getOperand(0).getReg();
   Register LoReg = MI.getOperand(1).getReg();
   Register HiReg = MI.getOperand(2).getReg();
@@ -22263,7 +22261,7 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
       .addFrameIndex(FI)
       .addImm(4)
       .addMemOperand(MMOHi);
-  TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI, Register());
+  TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, Register());
   MI.eraseFromParent(); // The pseudo instruction is gone now.
   return BB;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index ce8dd3ba4b7b3..e0cdd1186c5c1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -639,7 +639,6 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          Register SrcReg, bool IsKill, int FI,
                                          const TargetRegisterClass *RC,
-                                         const TargetRegisterInfo *TRI,
                                          Register VReg,
                                          MachineInstr::MIFlag Flags) const {
   MachineFunction *MF = MBB.getParent();
@@ -647,8 +646,8 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
 
   unsigned Opcode;
   if (RISCV::GPRRegClass.hasSubClassEq(RC)) {
-    Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ?
-             RISCV::SW : RISCV::SD;
+    Opcode = RegInfo.getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::SW
+                                                                : RISCV::SD;
   } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) {
     Opcode = RISCV::SH_INX;
   } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) {
@@ -705,7 +704,7 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
         .addFrameIndex(FI)
         .addMemOperand(MMO)
         .setMIFlag(Flags);
-    NumVRegSpilled += TRI->getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock;
+    NumVRegSpilled += RegInfo.getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock;
   } else {
     MachineMemOperand *MMO = MF->getMachineMemOperand(
         MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
@@ -720,10 +719,12 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   }
 }
 
-void RISCVInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          Register DstReg, int FI,
+                                          const TargetRegisterClass *RC,
+                                          Register VReg,
+                                          MachineInstr::MIFlag Flags) const {
   MachineFunction *MF = MBB.getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
   DebugLoc DL =
@@ -731,8 +732,8 @@ void RISCVInstrInfo::loadRegFromStackSlot(
 
   unsigned Opcode;
   if (RISCV::GPRRegClass.hasSubClassEq(RC)) {
-    Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ?
-             RISCV::LW : RISCV::LD;
+    Opcode = RegInfo.getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::LW
+                                                                : RISCV::LD;
   } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) {
     Opcode = RISCV::LH_INX;
   } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) {
@@ -788,7 +789,7 @@ void RISCVInstrInfo::loadRegFromStackSlot(
         .addFrameIndex(FI)
         .addMemOperand(MMO)
         .setMIFlag(Flags);
-    NumVRegReloaded += TRI->getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock;
+    NumVRegReloaded += RegInfo.getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock;
   } else {
     MachineMemOperand *MMO = MF->getMachineMemOperand(
         MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
@@ -1379,14 +1380,14 @@ void RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
       report_fatal_error("underestimated function size");
 
     storeRegToStackSlot(MBB, MI, TmpGPR, /*IsKill=*/true, FrameIndex,
-                        &RISCV::GPRRegClass, TRI, Register());
+                        &RISCV::GPRRegClass, Register());
     TRI->eliminateFrameIndex(std::prev(MI.getIterator()),
                              /*SpAdj=*/0, /*FIOperandNum=*/1);
 
     MI.getOperand(1).setMBB(&RestoreBB);
 
     loadRegFromStackSlot(RestoreBB, RestoreBB.end(), TmpGPR, FrameIndex,
-                         &RISCV::GPRRegClass, TRI, Register());
+                         &RISCV::GPRRegClass, Register());
     TRI->eliminateFrameIndex(RestoreBB.back(),
                              /*SpAdj=*/0, /*FIOperandNum=*/1);
   }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 800af2621b7df..0ffe015b9fac8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -116,13 +116,13 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
       bool IsKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DstReg,
-      int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   using TargetInstrInfo::foldMemoryOperandImpl;
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
index fcd6cd7f262bc..6596379061e60 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -527,7 +527,6 @@ void SparcInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          Register SrcReg, bool isKill, int FI,
                                          const TargetRegisterClass *RC,
-                                         const TargetRegisterInfo *TRI,
                                          Register VReg,
                                          MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
@@ -564,10 +563,12 @@ void SparcInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     llvm_unreachable("Can't store this register to stack slot");
 }
 
-void SparcInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void SparcInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          Register DestReg, int FI,
+                                          const TargetRegisterClass *RC,
+                                          Register VReg,
+                                          MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
 
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.h b/llvm/lib/Target/Sparc/SparcInstrInfo.h
index 01d0204734943..273888f427992 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.h
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.h
@@ -92,14 +92,13 @@ class SparcInstrInfo : public SparcGenInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   Register getGlobalBaseReg(MachineFunction *MF) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index dcefff99db25b..570bbd884a244 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -360,12 +360,12 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters(
     if (SystemZ::FP64BitRegClass.contains(Reg)) {
       MBB.addLiveIn(Reg);
       TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(),
-                               &SystemZ::FP64BitRegClass, TRI, Register());
+                               &SystemZ::FP64BitRegClass, Register());
     }
     if (SystemZ::VR128BitRegClass.contains(Reg)) {
       MBB.addLiveIn(Reg);
       TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(),
-                               &SystemZ::VR128BitRegClass, TRI, Register());
+                               &SystemZ::VR128BitRegClass, Register());
     }
   }
 
@@ -389,10 +389,10 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters(
     MCRegister Reg = I.getReg();
     if (SystemZ::FP64BitRegClass.contains(Reg))
       TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(),
-                                &SystemZ::FP64BitRegClass, TRI, Register());
+                                &SystemZ::FP64BitRegClass, Register());
     if (SystemZ::VR128BitRegClass.contains(Reg))
       TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(),
-                                &SystemZ::VR128BitRegClass, TRI, Register());
+                                &SystemZ::VR128BitRegClass, Register());
   }
 
   // Restore call-saved GPRs (but not call-clobbered varargs, which at
@@ -1157,12 +1157,12 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters(
     if (SystemZ::FP64BitRegClass.contains(Reg)) {
       MBB.addLiveIn(Reg);
       TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(),
-                               &SystemZ::FP64BitRegClass, TRI, Register());
+                               &SystemZ::FP64BitRegClass, Register());
     }
     if (SystemZ::VR128BitRegClass.contains(Reg)) {
       MBB.addLiveIn(Reg);
       TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(),
-                               &SystemZ::VR128BitRegClass, TRI, Register());
+                               &SystemZ::VR128BitRegClass, Register());
     }
   }
 
@@ -1189,10 +1189,10 @@ bool SystemZXPLINKFrameLowering::restoreCalleeSavedRegisters(
     MCRegister Reg = I.getReg();
     if (SystemZ::FP64BitRegClass.contains(Reg))
       TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(),
-                                &SystemZ::FP64BitRegClass, TRI, Register());
+                                &SystemZ::FP64BitRegClass, Register());
     if (SystemZ::VR128BitRegClass.contains(Reg))
       TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(),
-                                &SystemZ::VR128BitRegClass, TRI, Register());
+                                &SystemZ::VR128BitRegClass, Register());
   }
 
   // Restore call-saved GPRs (but not call-clobbered varargs, which at
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 23e7e7e314033..eb1ce4a2101d7 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -1023,8 +1023,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 void SystemZInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
     bool isKill, int FrameIdx, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
-    MachineInstr::MIFlag Flags) const {
+
+    Register VReg, MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
 
   // Callers may expect a single instruction, so keep 128-bit moves
@@ -1036,10 +1036,12 @@ void SystemZInstrInfo::storeRegToStackSlot(
                     FrameIdx);
 }
 
-void SystemZInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
-    int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MBBI,
+                                            Register DestReg, int FrameIdx,
+                                            const TargetRegisterClass *RC,
+                                            Register VReg,
+                                            MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
 
   // Callers may expect a single instruction, so keep 128-bit moves
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 7b9ad7b87a14f..4aecdd7498018 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -281,12 +281,14 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
       bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIdx, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
                                       LiveIntervals *LIS) const override;
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
index bae703bd97ac8..b9ac5d6254362 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -459,7 +459,6 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator I,
                                       Register SrcReg, bool isKill, int FI,
                                       const TargetRegisterClass *RC,
-                                      const TargetRegisterInfo *TRI,
                                       Register VReg,
                                       MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
@@ -519,10 +518,12 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     report_fatal_error("Can't store this register to stack slot");
 }
 
-void VEInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
-    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void VEInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       Register DestReg, int FI,
+                                       const TargetRegisterClass *RC,
+                                       Register VReg,
+                                       MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (I != MBB.end())
     DL = I->getDebugLoc();
diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h
index 408d3ab9e05f5..cedf7f21011ff 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.h
+++ b/llvm/lib/Target/VE/VEInstrInfo.h
@@ -92,13 +92,15 @@ class VEInstrInfo : public VEGenInstrInfo {
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
       bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
   /// } Stack Spill & Reload
 
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index 06f729a7e0cdc..25799f4ac0ea0 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -206,8 +206,7 @@ void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before,
   const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
   // Don't need shape information for tile store, becasue it is adjacent to
   // the tile def instruction.
-  TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI,
-                           Register());
+  TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, Register());
   ++NumStores;
 
   // TODO: update DBG_VALUEs
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index a66a3213403b4..8bca6344d6521 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -3093,8 +3093,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
     MBB.addLiveIn(Reg);
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
 
-    TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI,
-                            Register(), MachineInstr::FrameSetup);
+    TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, Register(),
+                            MachineInstr::FrameSetup);
   }
 
   return true;
@@ -3166,8 +3166,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(
       VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1;
 
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
-    TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI,
-                             Register());
+    TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, Register());
   }
 
   // Clear the stack slot for spill base pointer register.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index ae966427cf1f8..61d9608160197 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4780,14 +4780,14 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
 void X86InstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
     bool isKill, int FrameIdx, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
-    MachineInstr::MIFlag Flags) const {
+
+    Register VReg, MachineInstr::MIFlag Flags) const {
   const MachineFunction &MF = *MBB.getParent();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
+  assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
          "Stack slot too small for store");
 
-  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+  unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
   bool isAligned =
       (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
       (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
@@ -4801,15 +4801,17 @@ void X86InstrInfo::storeRegToStackSlot(
         .setMIFlag(Flags);
 }
 
-void X86InstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-    int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        Register DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        Register VReg,
+                                        MachineInstr::MIFlag Flags) const {
   const MachineFunction &MF = *MBB.getParent();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
+  assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
          "Load size exceeds stack slot");
-  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+  unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
   bool isAligned =
       (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
       (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
@@ -5551,7 +5553,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
         return false;
       ShouldUpdateCC = true;
     } else if (ImmDelta != 0) {
-      unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg));
+      unsigned BitWidth = RI.getRegSizeInBits(*MRI->getRegClass(SrcReg));
       // Shift amount for min/max constants to adjust for 8/16/32 instruction
       // sizes.
       switch (OldCC) {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 5545e39d1f6e5..a547fcd421411 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -467,14 +467,14 @@ class X86InstrInfo final : public X86GenInstrInfo {
                    bool RenamableSrc = false) const override;
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
       int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
index cdb5186d23d3c..351a221c92ebd 100644
--- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -432,7 +432,7 @@ bool XCoreFrameLowering::spillCalleeSavedRegisters(
     // Add the callee-saved register as live-in. It's killed at the spill.
     MBB.addLiveIn(Reg);
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI,
+    TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC,
                             Register());
     if (emitFrameMoves) {
       auto Store = MI;
@@ -458,8 +458,7 @@ bool XCoreFrameLowering::restoreCalleeSavedRegisters(
            "LR & FP are always handled in emitEpilogue");
 
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-    TII.loadRegFromStackSlot(MBB, MI, Reg, CSR.getFrameIdx(), RC, TRI,
-                             Register());
+    TII.loadRegFromStackSlot(MBB, MI, Reg, CSR.getFrameIdx(), RC, Register());
     assert(MI != MBB.begin() &&
            "loadRegFromStackSlot didn't insert any code!");
     // Insert in reverse order.  loadRegFromStackSlot can insert multiple
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
index 80fda3480aa44..075910c84fb84 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -355,8 +355,8 @@ void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 void XCoreInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg,
     bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
-    MachineInstr::MIFlag Flags) const {
+
+    Register VReg, MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
   if (I != MBB.end() && !I->isDebugInstr())
     DL = I->getDebugLoc();
@@ -377,7 +377,6 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator I,
                                           Register DestReg, int FrameIndex,
                                           const TargetRegisterClass *RC,
-                                          const TargetRegisterInfo *TRI,
                                           Register VReg,
                                           MachineInstr::MIFlag Flags) const {
   DebugLoc DL;
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.h b/llvm/lib/Target/XCore/XCoreInstrInfo.h
index 3543392653786..c4e399ebd3fd8 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.h
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.h
@@ -71,13 +71,15 @@ class XCoreInstrInfo : public XCoreGenInstrInfo {
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
       bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
       int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   bool reverseBranchCondition(
diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
index cf9a2a052978d..1c0dc66a46144 100644
--- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp
@@ -314,7 +314,7 @@ bool XtensaFrameLowering::spillCalleeSavedRegisters(
     bool IsKill = !IsA0AndRetAddrIsTaken;
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
     TII.storeRegToStackSlot(EntryBlock, MI, Reg, IsKill, CSI[i].getFrameIdx(),
-                            RC, TRI, Register());
+                            RC, Register());
   }
 
   return true;
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
index 6bbebdecd9988..d7b05acea9411 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -145,8 +145,8 @@ void XtensaInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 void XtensaInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
     bool isKill, int FrameIdx, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
-    MachineInstr::MIFlag Flags) const {
+
+    Register VReg, MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   unsigned LoadOpcode, StoreOpcode;
   getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode, FrameIdx);
@@ -155,10 +155,12 @@ void XtensaInstrInfo::storeRegToStackSlot(
   addFrameReference(MIB, FrameIdx);
 }
 
-void XtensaInstrInfo::loadRegFromStackSlot(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
-    int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-    Register VReg, MachineInstr::MIFlag Flags) const {
+void XtensaInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MBBI,
+                                           Register DestReg, int FrameIdx,
+                                           const TargetRegisterClass *RC,
+                                           Register VReg,
+                                           MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   unsigned LoadOpcode, StoreOpcode;
   getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode, FrameIdx);
@@ -544,12 +546,12 @@ void XtensaInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
           "function code size is significantly larger than estimated");
 
     storeRegToStackSlot(MBB, L32R, ScavRegister, /*IsKill=*/true, FrameIndex,
-                        &Xtensa::ARRegClass, &RI, Register());
+                        &Xtensa::ARRegClass, Register());
     RI.eliminateFrameIndex(std::prev(L32R.getIterator()),
                            /*SpAdj=*/0, /*FIOperandNum=*/1);
 
     loadRegFromStackSlot(RestoreBB, RestoreBB.end(), ScavRegister, FrameIndex,
-                         &Xtensa::ARRegClass, &RI, Register());
+                         &Xtensa::ARRegClass, Register());
     RI.eliminateFrameIndex(RestoreBB.back(),
                            /*SpAdj=*/0, /*FIOperandNum=*/1);
     JumpToMBB = &RestoreBB;
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h
index 1808cb36d8a9b..0b46d6ce2fdb7 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h
@@ -56,14 +56,13 @@ class XtensaInstrInfo : public XtensaGenInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
-      bool isKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
       Register DestReg, int FrameIdx, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   // Get the load and store opcodes for a given register class and offset.

From dd75d3014005967fc7070d6c9c0abdacb940c035 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Tue, 11 Nov 2025 08:27:25 +0800
Subject: [PATCH 41/97] [X86][NFC] Change check from MIR to assemble for non
 EGPR instructions (#167275)

---
 llvm/test/CodeGen/X86/apx/no-rex2-general.ll  | 122 +++++++++---------
 .../CodeGen/X86/apx/no-rex2-pseudo-amx.ll     |  29 +++--
 .../CodeGen/X86/apx/no-rex2-pseudo-x87.ll     |  31 +++--
 llvm/test/CodeGen/X86/apx/no-rex2-special.ll  | 113 ++++++++--------
 4 files changed, 159 insertions(+), 136 deletions(-)

diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
index 805fc7ccaab76..2b34739fa80e3 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
@@ -1,76 +1,80 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr  | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr  | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr  | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefixes=CHECK,AVX
 
 define i32 @map0(ptr nocapture noundef readonly %a, i64 noundef %b) {
-  ; SSE-LABEL: name: map0
-  ; SSE: bb.0.entry:
-  ; SSE-NEXT:   liveins: $rdi, $rsi
-  ; SSE-NEXT: {{  $}}
-  ; SSE-NEXT:   [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi
-  ; SSE-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
-  ; SSE-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr)
-  ; SSE-NEXT:   $eax = COPY [[MOV32rm]]
-  ; SSE-NEXT:   RET 0, $eax
-  ; AVX-LABEL: name: map0
-  ; AVX: bb.0.entry:
-  ; AVX-NEXT:   liveins: $rdi, $rsi
-  ; AVX-NEXT: {{  $}}
-  ; AVX-NEXT:   [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi
-  ; AVX-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
-  ; AVX-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr)
-  ; AVX-NEXT:   $eax = COPY [[MOV32rm]]
-  ; AVX-NEXT:   RET 0, $eax
+; CHECK-LABEL: map0:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rsi, %r16
+; CHECK-NEXT:    movq %rdi, %r17
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl (%r17,%r16,4), %eax
+; CHECK-NEXT:    retq
 entry:
   %add.ptr = getelementptr inbounds i32, ptr %a, i64 %b
+  tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
   %0 = load i32, ptr %add.ptr
   ret i32 %0
 }
 
-define i32 @map1_or_vex(<2 x double> noundef %a) {
-  ; SSE-LABEL: name: map1_or_vex
-  ; SSE: bb.0.entry:
-  ; SSE-NEXT:   liveins: $xmm0
-  ; SSE-NEXT: {{  $}}
-  ; SSE-NEXT:   [[COPY:%[0-9]+]]:vr128 = COPY $xmm0
-  ; SSE-NEXT:   [[CVTSD2SIrr_Int:%[0-9]+]]:gr32 = nofpexcept CVTSD2SIrr_Int [[COPY]], implicit $mxcsr
-  ; SSE-NEXT:   $eax = COPY [[CVTSD2SIrr_Int]]
-  ; SSE-NEXT:   RET 0, $eax
-  ; AVX-LABEL: name: map1_or_vex
-  ; AVX: bb.0.entry:
-  ; AVX-NEXT:   liveins: $xmm0
-  ; AVX-NEXT: {{  $}}
-  ; AVX-NEXT:   [[COPY:%[0-9]+]]:vr128 = COPY $xmm0
-  ; AVX-NEXT:   [[VCVTSD2SIrr_Int:%[0-9]+]]:gr32_norex2 = nofpexcept VCVTSD2SIrr_Int [[COPY]], implicit $mxcsr
-  ; AVX-NEXT:   $eax = COPY [[VCVTSD2SIrr_Int]]
-  ; AVX-NEXT:   RET 0, $eax
+define i32 @map1_or_vex(<2 x double> noundef %a) nounwind {
+; SSE-LABEL: map1_or_vex:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    cvtsd2si %xmm0, %r16d
+; SSE-NEXT:    #APP
+; SSE-NEXT:    nop
+; SSE-NEXT:    #NO_APP
+; SSE-NEXT:    movl %r16d, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: map1_or_vex:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    vcvtsd2si %xmm0, %ebx
+; AVX-NEXT:    #APP
+; AVX-NEXT:    nop
+; AVX-NEXT:    #NO_APP
+; AVX-NEXT:    movl %ebx, %eax
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    retq
 entry:
   %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a)
+  tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
   ret i32 %0
 }
 
-define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) {
-  ; SSE-LABEL: name: map2_or_vex
-  ; SSE: bb.0.entry:
-  ; SSE-NEXT:   liveins: $rdi, $rsi
-  ; SSE-NEXT: {{  $}}
-  ; SSE-NEXT:   [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi
-  ; SSE-NEXT:   [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
-  ; SSE-NEXT:   [[PABSBrm:%[0-9]+]]:vr128 = PABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr)
-  ; SSE-NEXT:   $xmm0 = COPY [[PABSBrm]]
-  ; SSE-NEXT:   RET 0, $xmm0
-  ; AVX-LABEL: name: map2_or_vex
-  ; AVX: bb.0.entry:
-  ; AVX-NEXT:   liveins: $rdi, $rsi
-  ; AVX-NEXT: {{  $}}
-  ; AVX-NEXT:   [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi
-  ; AVX-NEXT:   [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
-  ; AVX-NEXT:   [[VPABSBrm:%[0-9]+]]:vr128 = VPABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr)
-  ; AVX-NEXT:   $xmm0 = COPY [[VPABSBrm]]
-  ; AVX-NEXT:   RET 0, $xmm0
+define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) nounwind {
+; SSE-LABEL: map2_or_vex:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    movq %rsi, %rbx
+; SSE-NEXT:    movq %rdi, %r14
+; SSE-NEXT:    #APP
+; SSE-NEXT:    nop
+; SSE-NEXT:    #NO_APP
+; SSE-NEXT:    pabsb (%r14,%rbx,4), %xmm0
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: map2_or_vex:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %r14
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    movq %rsi, %rbx
+; AVX-NEXT:    movq %rdi, %r14
+; AVX-NEXT:    #APP
+; AVX-NEXT:    nop
+; AVX-NEXT:    #NO_APP
+; AVX-NEXT:    vpabsb (%r14,%rbx,4), %xmm0
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    popq %r14
+; AVX-NEXT:    retq
 entry:
+  tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
   %add.ptr = getelementptr inbounds i32, ptr %b, i64 %c
   %a = load <2 x i64>, ptr %add.ptr
   %0 = bitcast <2 x i64> %a to <16 x i8>
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
index 5fa4cb4c8826b..c193680607f76 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
@@ -1,17 +1,20 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+amx-tile,+egpr | FileCheck %s
 
-define dso_local void @amx(ptr noundef %data) {
-  ; CHECK-LABEL: name: amx
-  ; CHECK: bb.0.entry:
-  ; CHECK-NEXT:   liveins: $rdi
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rdi
-  ; CHECK-NEXT:   [[MOV32ri64_:%[0-9]+]]:gr64_norex2_nosp = MOV32ri64 8
-  ; CHECK-NEXT:   PTILELOADD 4, [[COPY]], 1, killed [[MOV32ri64_]], 0, $noreg
-  ; CHECK-NEXT:   RET 0
-  entry:
+define dso_local void @amx(ptr noundef %data) nounwind {
+; CHECK-LABEL: amx:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl $8, %eax
+; CHECK-NEXT:    tileloadd (%rbx,%rax), %tmm4
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+entry:
+  tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
   call void @llvm.x86.tileloadd64(i8 4, ptr %data, i64 8)
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
index a9ca591a156c2..4692a58d095a6 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
@@ -1,17 +1,22 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse,+egpr | FileCheck %s
 
-define void @x87(ptr %0, ptr %1) {
-  ; CHECK-LABEL: name: x87
-  ; CHECK: bb.0 (%ir-block.2):
-  ; CHECK-NEXT:   liveins: $rdi, $rsi
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rsi
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
-  ; CHECK-NEXT:   [[LD_Fp32m:%[0-9]+]]:rfp32 = nofpexcept LD_Fp32m [[COPY1]], 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load (s32) from %ir.0)
-  ; CHECK-NEXT:   nofpexcept ST_Fp32m [[COPY]], 1, $noreg, 0, $noreg, killed [[LD_Fp32m]], implicit-def dead $fpsw, implicit $fpcw :: (store (s32) into %ir.1)
-  ; CHECK-NEXT:   RET 0
+define void @x87(ptr %0, ptr %1) nounwind {
+; CHECK-LABEL: x87:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rsi, %rbx
+; CHECK-NEXT:    movq %rdi, %r14
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    flds (%r14)
+; CHECK-NEXT:    fstps (%rbx)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    retq
+  tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
   %3 = load float, ptr %0
   store float %3, ptr %1
   ret void
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
index 86534427a9eae..f2025b5c8cbf8 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
@@ -1,70 +1,81 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr  | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr  | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+xsave,+egpr  | FileCheck %s
 
-define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) {
-  ; CHECK-LABEL: name: test_xsave
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $rdi, $esi, $edx
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr32 = COPY $edx
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr32 = COPY $esi
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
-  ; CHECK-NEXT:   $edx = COPY [[COPY1]]
-  ; CHECK-NEXT:   $eax = COPY [[COPY]]
-  ; CHECK-NEXT:   XSAVE [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
-  ; CHECK-NEXT:   RET 0
+define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xsave:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %edx, %r16d
+; CHECK-NEXT:    movl %esi, %edx
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl %r16d, %eax
+; CHECK-NEXT:    xsave (%rbx)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+  tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
   call void @llvm.x86.xsave(ptr %ptr, i32 %hi, i32 %lo)
   ret void;
 }
 declare void @llvm.x86.xsave(ptr, i32, i32)
 
-define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) {
-  ; CHECK-LABEL: name: test_xsave64
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $rdi, $esi, $edx
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr32 = COPY $edx
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr32 = COPY $esi
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
-  ; CHECK-NEXT:   $edx = COPY [[COPY1]]
-  ; CHECK-NEXT:   $eax = COPY [[COPY]]
-  ; CHECK-NEXT:   XSAVE64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
-  ; CHECK-NEXT:   RET 0
+define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xsave64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %edx, %r16d
+; CHECK-NEXT:    movl %esi, %edx
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl %r16d, %eax
+; CHECK-NEXT:    xsave64 (%rbx)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+  tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
   call void @llvm.x86.xsave64(ptr %ptr, i32 %hi, i32 %lo)
   ret void;
 }
 declare void @llvm.x86.xsave64(ptr, i32, i32)
 
-define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) {
-  ; CHECK-LABEL: name: test_xrstor
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $rdi, $esi, $edx
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr32 = COPY $edx
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr32 = COPY $esi
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
-  ; CHECK-NEXT:   $edx = COPY [[COPY1]]
-  ; CHECK-NEXT:   $eax = COPY [[COPY]]
-  ; CHECK-NEXT:   XRSTOR [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
-  ; CHECK-NEXT:   RET 0
+define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xrstor:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %edx, %r16d
+; CHECK-NEXT:    movl %esi, %edx
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl %r16d, %eax
+; CHECK-NEXT:    xrstor (%rbx)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+  tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
   call void @llvm.x86.xrstor(ptr %ptr, i32 %hi, i32 %lo)
   ret void;
 }
 declare void @llvm.x86.xrstor(ptr, i32, i32)
 
-define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) {
-  ; CHECK-LABEL: name: test_xrstor64
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $rdi, $esi, $edx
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr32 = COPY $edx
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr32 = COPY $esi
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
-  ; CHECK-NEXT:   $edx = COPY [[COPY1]]
-  ; CHECK-NEXT:   $eax = COPY [[COPY]]
-  ; CHECK-NEXT:   XRSTOR64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
-  ; CHECK-NEXT:   RET 0
+define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xrstor64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %edx, %r16d
+; CHECK-NEXT:    movl %esi, %edx
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl %r16d, %eax
+; CHECK-NEXT:    xrstor64 (%rbx)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+  tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
   call void @llvm.x86.xrstor64(ptr %ptr, i32 %hi, i32 %lo)
   ret void;
 }

From 4369d52c282ca42ef43df6548aa097e51e319a99 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 10 Nov 2025 16:29:13 -0800
Subject: [PATCH 42/97] workflows/llvm-abi-tests: Trigger tests on updates to
 the file (#167413)

Also remove deleted file from the file list.
---
 .github/workflows/llvm-abi-tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml
index b0c2d32d4a41b..1ea134382ef6f 100644
--- a/.github/workflows/llvm-abi-tests.yml
+++ b/.github/workflows/llvm-abi-tests.yml
@@ -10,13 +10,13 @@ on:
       - 'release/**'
     paths:
       - 'llvm/**'
-      - '.github/workflows/llvm-tests.yml'
+      - '.github/workflows/llvm-abi-tests.yml'
   pull_request:
     branches:
       - 'release/**'
     paths:
       - 'llvm/**'
-      - '.github/workflows/llvm-tests.yml'
+      - '.github/workflows/llvm-abi-tests.yml'
 
 concurrency:
   # Skip intermediate builds: always.

From 9c7fe126f48f87c3cf73f4bc83a47820b320ed48 Mon Sep 17 00:00:00 2001
From: Christopher Ferris <cferris1000@users.noreply.github.com>
Date: Mon, 10 Nov 2025 16:48:19 -0800
Subject: [PATCH 43/97] Revert "[scudo] Small cleanup of memory tagging code."
 (#167425)

Reverts llvm/llvm-project#166860

The local static variable causes build failures.
---
 compiler-rt/lib/scudo/standalone/combined.h                | 3 ++-
 compiler-rt/lib/scudo/standalone/memtag.h                  | 7 ++++---
 compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp     | 1 +
 compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp  | 4 +++-
 .../lib/scudo/standalone/tests/wrappers_cpp_test.cpp       | 3 ++-
 5 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 2d0f4f53d2ac5..ffe9554203241 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -171,7 +171,8 @@ class Allocator {
       Primary.Options.set(OptionBit::DeallocTypeMismatch);
     if (getFlags()->delete_size_mismatch)
       Primary.Options.set(OptionBit::DeleteSizeMismatch);
-    if (systemSupportsMemoryTagging())
+    if (allocatorSupportsMemoryTagging<AllocatorConfig>() &&
+        systemSupportsMemoryTagging())
       Primary.Options.set(OptionBit::UseMemoryTagging);
 
     QuarantineMaxChunkSize =
diff --git a/compiler-rt/lib/scudo/standalone/memtag.h b/compiler-rt/lib/scudo/standalone/memtag.h
index 52897b3dca6ae..83ebe676433eb 100644
--- a/compiler-rt/lib/scudo/standalone/memtag.h
+++ b/compiler-rt/lib/scudo/standalone/memtag.h
@@ -66,8 +66,7 @@ inline bool systemSupportsMemoryTagging() {
 #ifndef HWCAP2_MTE
 #define HWCAP2_MTE (1 << 18)
 #endif
-  static bool SupportsMemoryTagging = getauxval(AT_HWCAP2) & HWCAP2_MTE;
-  return SupportsMemoryTagging;
+  return getauxval(AT_HWCAP2) & HWCAP2_MTE;
 }
 
 inline bool systemDetectsMemoryTagFaultsTestOnly() {
@@ -262,7 +261,9 @@ inline uptr loadTag(uptr Ptr) {
 
 #else
 
-inline bool systemSupportsMemoryTagging() { return false; }
+inline NORETURN bool systemSupportsMemoryTagging() {
+  UNREACHABLE("memory tagging not supported");
+}
 
 inline NORETURN bool systemDetectsMemoryTagFaultsTestOnly() {
   UNREACHABLE("memory tagging not supported");
diff --git a/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp b/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp
index d0d93316f212e..09093e11452dd 100644
--- a/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp
@@ -28,6 +28,7 @@ TEST(MemtagBasicDeathTest, Unsupported) {
   EXPECT_DEATH(untagPointer((uptr)0), "not supported");
   EXPECT_DEATH(extractTag((uptr)0), "not supported");
 
+  EXPECT_DEATH(systemSupportsMemoryTagging(), "not supported");
   EXPECT_DEATH(systemDetectsMemoryTagFaultsTestOnly(), "not supported");
   EXPECT_DEATH(enableSystemMemoryTaggingTestOnly(), "not supported");
 
diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
index 8741c8299b57c..855a3e6e6109f 100644
--- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
@@ -27,7 +27,9 @@
 const scudo::uptr PageSize = scudo::getPageSizeCached();
 
 template <typename Config> static scudo::Options getOptionsForConfig() {
-  if (!scudo::systemSupportsMemoryTagging())
+  if (!Config::getMaySupportMemoryTagging() ||
+      !scudo::archSupportsMemoryTagging() ||
+      !scudo::systemSupportsMemoryTagging())
     return {};
   scudo::AtomicOptions AO;
   AO.set(scudo::OptionBit::UseMemoryTagging);
diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp
index 84359251a02aa..c802ed22fbad0 100644
--- a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp
@@ -187,7 +187,8 @@ TEST_F(ScudoWrappersCppTest, ThreadedNew) {
   // TODO: Investigate why libc sometimes crashes with tag missmatch in
   // __pthread_clockjoin_ex.
   std::unique_ptr<scudo::ScopedDisableMemoryTagChecks> NoTags;
-  if (!SCUDO_ANDROID && scudo::systemSupportsMemoryTagging())
+  if (!SCUDO_ANDROID && scudo::archSupportsMemoryTagging() &&
+      scudo::systemSupportsMemoryTagging())
     NoTags = std::make_unique<scudo::ScopedDisableMemoryTagChecks>();
 
   Ready = false;

From 70d379fd7a1e3ab54e4afc46e68411daa1013550 Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental@gmail.com>
Date: Mon, 10 Nov 2025 16:54:44 -0800
Subject: [PATCH 44/97] Revert "[mlir] Add FP software implementation lowering
 pass: `arith-to-apfloat` (#166618)" (#167429)

This reverts commit 222f4e494a0cd9515c242fd083c2776772734385.
---
 .../ArithToAPFloat/ArithToAPFloat.h           |  21 ---
 mlir/include/mlir/Conversion/Passes.h         |   1 -
 mlir/include/mlir/Conversion/Passes.td        |  15 --
 mlir/include/mlir/Dialect/Func/Utils/Utils.h  |   7 -
 .../mlir/Dialect/LLVMIR/FunctionCallUtils.h   |   4 -
 .../ArithToAPFloat/ArithToAPFloat.cpp         | 161 ------------------
 .../Conversion/ArithToAPFloat/CMakeLists.txt  |  17 --
 .../Conversion/ArithToLLVM/ArithToLLVM.cpp    |   1 -
 mlir/lib/Conversion/CMakeLists.txt            |   1 -
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      |  14 --
 mlir/lib/Dialect/Func/Utils/Utils.cpp         |  25 ---
 .../Dialect/LLVMIR/IR/FunctionCallUtils.cpp   |  11 --
 mlir/lib/ExecutionEngine/APFloatWrappers.cpp  |  81 ---------
 mlir/lib/ExecutionEngine/CMakeLists.txt       |  12 --
 .../ArithToApfloat/arith-to-apfloat.mlir      | 128 --------------
 .../Arith/CPU/test-apfloat-emulation.mlir     |  34 ----
 16 files changed, 533 deletions(-)
 delete mode 100644 mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
 delete mode 100644 mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
 delete mode 100644 mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
 delete mode 100644 mlir/lib/ExecutionEngine/APFloatWrappers.cpp
 delete mode 100644 mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
 delete mode 100644 mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir

diff --git a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
deleted file mode 100644
index 64a42a228199e..0000000000000
--- a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
+++ /dev/null
@@ -1,21 +0,0 @@
-//===- ArithToAPFloat.h - Arith to APFloat impl conversion ---*- C++ ----*-===//
-//
-// Part of the APFloat Project, under the Apache License v2.0 with APFloat
-// Exceptions. See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH APFloat-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
-#define MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
-
-#include <memory>
-
-namespace mlir {
-class Pass;
-
-#define GEN_PASS_DECL_ARITHTOAPFLOATCONVERSIONPASS
-#include "mlir/Conversion/Passes.h.inc"
-} // namespace mlir
-
-#endif // MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index 82bdfd02661a6..40d866ec7bf10 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -12,7 +12,6 @@
 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h"
-#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
 #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h"
 #include "mlir/Conversion/ArithToEmitC/ArithToEmitCPass.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 79bc380dbcb7a..70e3e45c225db 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -186,21 +186,6 @@ def ArithToLLVMConversionPass : Pass<"convert-arith-to-llvm"> {
   ];
 }
 
-//===----------------------------------------------------------------------===//
-// ArithToAPFloat
-//===----------------------------------------------------------------------===//
-
-def ArithToAPFloatConversionPass
-    : Pass<"convert-arith-to-apfloat", "ModuleOp"> {
-  let summary = "Convert Arith ops to APFloat runtime library calls";
-  let description = [{
-    This pass converts supported Arith ops to APFloat-based runtime library
-    calls (APFloatWrappers.cpp). APFloat is a software implementation of
-    floating-point arithmetic operations.
-  }];
-  let dependentDialects = ["func::FuncDialect"];
-}
-
 //===----------------------------------------------------------------------===//
 // ArithToSPIRV
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Func/Utils/Utils.h b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
index 00d50874a2e8d..3576126a487ac 100644
--- a/mlir/include/mlir/Dialect/Func/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
@@ -60,13 +60,6 @@ mlir::FailureOr<std::pair<mlir::func::FuncOp, mlir::func::CallOp>>
 deduplicateArgsOfFuncOp(mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp,
                         mlir::ModuleOp moduleOp);
 
-/// Look up a FuncOp with signature `resultTypes`(`paramTypes`)` and name
-/// `name`. Return a failure if the FuncOp is found but with a different
-/// signature.
-FailureOr<FuncOp> lookupFnDecl(SymbolOpInterface symTable, StringRef name,
-                               FunctionType funcT,
-                               SymbolTableCollection *symbolTables = nullptr);
-
 } // namespace func
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
index b09d32022e348..8ad9ed18acebd 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
@@ -52,10 +52,6 @@ lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp,
 FailureOr<LLVM::LLVMFuncOp>
 lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
                          SymbolTableCollection *symbolTables = nullptr);
-FailureOr<LLVM::LLVMFuncOp>
-lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
-                             SymbolTableCollection *symbolTables = nullptr);
-
 /// Declares a function to print a C-string.
 /// If a custom runtime function is defined via `runtimeFunctionName`, it must
 /// have the signature void(char const*). The default function is `printString`.
diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
deleted file mode 100644
index 012e934d3050f..0000000000000
--- a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-//===- ArithToAPFloat.cpp - Arithmetic to APFloat Conversion --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
-
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Arith/Transforms/Passes.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Func/Utils/Utils.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/Verifier.h"
-#include "mlir/Transforms/WalkPatternRewriteDriver.h"
-#include "llvm/Support/Debug.h"
-
-#define DEBUG_TYPE "arith-to-apfloat"
-
-namespace mlir {
-#define GEN_PASS_DEF_ARITHTOAPFLOATCONVERSIONPASS
-#include "mlir/Conversion/Passes.h.inc"
-} // namespace mlir
-
-using namespace mlir;
-using namespace mlir::func;
-
-static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable,
-                           StringRef name, FunctionType funcT, bool setPrivate,
-                           SymbolTableCollection *symbolTables = nullptr) {
-  OpBuilder::InsertionGuard g(b);
-  assert(!symTable->getRegion(0).empty() && "expected non-empty region");
-  b.setInsertionPointToStart(&symTable->getRegion(0).front());
-  FuncOp funcOp = FuncOp::create(b, symTable->getLoc(), name, funcT);
-  if (setPrivate)
-    funcOp.setPrivate();
-  if (symbolTables) {
-    SymbolTable &symbolTable = symbolTables->getSymbolTable(symTable);
-    symbolTable.insert(funcOp, symTable->getRegion(0).front().begin());
-  }
-  return funcOp;
-}
-
-/// Helper function to look up or create the symbol for a runtime library
-/// function for a binary arithmetic operation.
-///
-/// Parameter 1: APFloat semantics
-/// Parameter 2: Left-hand side operand
-/// Parameter 3: Right-hand side operand
-///
-/// This function will return a failure if the function is found but has an
-/// unexpected signature.
-///
-static FailureOr<FuncOp>
-lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name,
-                       SymbolTableCollection *symbolTables = nullptr) {
-  auto i32Type = IntegerType::get(symTable->getContext(), 32);
-  auto i64Type = IntegerType::get(symTable->getContext(), 64);
-
-  std::string funcName = (llvm::Twine("__mlir_apfloat_") + name).str();
-  FunctionType funcT =
-      FunctionType::get(b.getContext(), {i32Type, i64Type, i64Type}, {i64Type});
-  FailureOr<FuncOp> func =
-      lookupFnDecl(symTable, funcName, funcT, symbolTables);
-  // Failed due to type mismatch.
-  if (failed(func))
-    return func;
-  // Successfully matched existing decl.
-  if (*func)
-    return *func;
-
-  return createFnDecl(b, symTable, funcName, funcT,
-                      /*setPrivate=*/true, symbolTables);
-}
-
-/// Rewrite a binary arithmetic operation to an APFloat function call.
-template <typename OpTy, const char *APFloatName>
-struct BinaryArithOpToAPFloatConversion final : OpRewritePattern<OpTy> {
-  BinaryArithOpToAPFloatConversion(MLIRContext *context, PatternBenefit benefit,
-                                   SymbolOpInterface symTable)
-      : OpRewritePattern<OpTy>(context, benefit), symTable(symTable) {};
-
-  LogicalResult matchAndRewrite(OpTy op,
-                                PatternRewriter &rewriter) const override {
-    // Get APFloat function from runtime library.
-    FailureOr<FuncOp> fn =
-        lookupOrCreateBinaryFn(rewriter, symTable, APFloatName);
-    if (failed(fn))
-      return fn;
-
-    rewriter.setInsertionPoint(op);
-    // Cast operands to 64-bit integers.
-    Location loc = op.getLoc();
-    auto floatTy = cast<FloatType>(op.getType());
-    auto intWType = rewriter.getIntegerType(floatTy.getWidth());
-    auto int64Type = rewriter.getI64Type();
-    Value lhsBits = arith::ExtUIOp::create(
-        rewriter, loc, int64Type,
-        arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs()));
-    Value rhsBits = arith::ExtUIOp::create(
-        rewriter, loc, int64Type,
-        arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs()));
-
-    // Call APFloat function.
-    int32_t sem =
-        llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
-    Value semValue = arith::ConstantOp::create(
-        rewriter, loc, rewriter.getI32Type(),
-        rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
-    SmallVector<Value> params = {semValue, lhsBits, rhsBits};
-    auto resultOp =
-        func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()),
-                             SymbolRefAttr::get(*fn), params);
-
-    // Truncate result to the original width.
-    Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType,
-                                                  resultOp->getResult(0));
-    rewriter.replaceOp(
-        op, arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits));
-    return success();
-  }
-
-  SymbolOpInterface symTable;
-};
-
-namespace {
-struct ArithToAPFloatConversionPass final
-    : impl::ArithToAPFloatConversionPassBase<ArithToAPFloatConversionPass> {
-  using Base::Base;
-
-  void runOnOperation() override {
-    MLIRContext *context = &getContext();
-    RewritePatternSet patterns(context);
-    static const char add[] = "add";
-    static const char subtract[] = "subtract";
-    static const char multiply[] = "multiply";
-    static const char divide[] = "divide";
-    static const char remainder[] = "remainder";
-    patterns.add<BinaryArithOpToAPFloatConversion<arith::AddFOp, add>,
-                 BinaryArithOpToAPFloatConversion<arith::SubFOp, subtract>,
-                 BinaryArithOpToAPFloatConversion<arith::MulFOp, multiply>,
-                 BinaryArithOpToAPFloatConversion<arith::DivFOp, divide>,
-                 BinaryArithOpToAPFloatConversion<arith::RemFOp, remainder>>(
-        context, 1, getOperation());
-    LogicalResult result = success();
-    ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) {
-      if (diag.getSeverity() == DiagnosticSeverity::Error) {
-        result = failure();
-      }
-      // NB: if you don't return failure, no other diag handlers will fire (see
-      // mlir/lib/IR/Diagnostics.cpp:DiagnosticEngineImpl::emit).
-      return failure();
-    });
-    walkAndApplyPatterns(getOperation(), std::move(patterns));
-    if (failed(result))
-      return signalPassFailure();
-  }
-};
-} // namespace
diff --git a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
deleted file mode 100644
index b0d1e46b3655f..0000000000000
--- a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-add_mlir_conversion_library(MLIRArithToAPFloat
-  ArithToAPFloat.cpp
-
-  ADDITIONAL_HEADER_DIRS
-  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ArithToLLVM
-
-  DEPENDS
-  MLIRConversionPassIncGen
-
-  LINK_COMPONENTS
-  Core
-
-  LINK_LIBS PUBLIC
-  MLIRArithDialect
-  MLIRArithTransforms
-  MLIRFuncDialect
-  )
diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
index f2bacc3399144..b6099902cc337 100644
--- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
+++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
@@ -14,7 +14,6 @@
 #include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
-#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/TypeUtilities.h"
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index 613dc6d242ceb..bebf1b8fff3f9 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -2,7 +2,6 @@ add_subdirectory(AffineToStandard)
 add_subdirectory(AMDGPUToROCDL)
 add_subdirectory(ArithCommon)
 add_subdirectory(ArithToAMDGPU)
-add_subdirectory(ArithToAPFloat)
 add_subdirectory(ArithToArmSME)
 add_subdirectory(ArithToEmitC)
 add_subdirectory(ArithToLLVM)
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index c747e1b59558a..69a317ecd101f 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1654,20 +1654,6 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern<vector::PrintOp> {
           return failure();
         }
       }
-    } else if (auto floatTy = dyn_cast<FloatType>(printType)) {
-      // Print other floating-point types using the APFloat runtime library.
-      int32_t sem =
-          llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
-      Value semValue = LLVM::ConstantOp::create(
-          rewriter, loc, rewriter.getI32Type(),
-          rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
-      Value floatBits =
-          LLVM::ZExtOp::create(rewriter, loc, rewriter.getI64Type(), value);
-      printer =
-          LLVM::lookupOrCreateApFloatPrintFn(rewriter, parent, symbolTables);
-      emitCall(rewriter, loc, printer.value(),
-               ValueRange({semValue, floatBits}));
-      return success();
     } else {
       return failure();
     }
diff --git a/mlir/lib/Dialect/Func/Utils/Utils.cpp b/mlir/lib/Dialect/Func/Utils/Utils.cpp
index d6dfd0229963c..b4cb0932ef631 100644
--- a/mlir/lib/Dialect/Func/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Func/Utils/Utils.cpp
@@ -254,28 +254,3 @@ func::deduplicateArgsOfFuncOp(RewriterBase &rewriter, func::FuncOp funcOp,
 
   return std::make_pair(*newFuncOpOrFailure, newCallOp);
 }
-
-FailureOr<func::FuncOp>
-func::lookupFnDecl(SymbolOpInterface symTable, StringRef name,
-                   FunctionType funcT, SymbolTableCollection *symbolTables) {
-  FuncOp func;
-  if (symbolTables) {
-    func = symbolTables->lookupSymbolIn<FuncOp>(
-        symTable, StringAttr::get(symTable->getContext(), name));
-  } else {
-    func = llvm::dyn_cast_or_null<FuncOp>(
-        SymbolTable::lookupSymbolIn(symTable, name));
-  }
-
-  if (!func)
-    return func;
-
-  mlir::FunctionType foundFuncT = func.getFunctionType();
-  // Assert the signature of the found function is same as expected
-  if (funcT != foundFuncT) {
-    return func.emitError("matched function '")
-           << name << "' but with different type: " << foundFuncT
-           << " (expected " << funcT << ")";
-  }
-  return func;
-}
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index 160b6ae89215c..feaffa34897b6 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -30,7 +30,6 @@ static constexpr llvm::StringRef kPrintF16 = "printF16";
 static constexpr llvm::StringRef kPrintBF16 = "printBF16";
 static constexpr llvm::StringRef kPrintF32 = "printF32";
 static constexpr llvm::StringRef kPrintF64 = "printF64";
-static constexpr llvm::StringRef kPrintApFloat = "printApFloat";
 static constexpr llvm::StringRef kPrintString = "printString";
 static constexpr llvm::StringRef kPrintOpen = "printOpen";
 static constexpr llvm::StringRef kPrintClose = "printClose";
@@ -161,16 +160,6 @@ mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
       LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
-FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
-                                         SymbolTableCollection *symbolTables) {
-  return lookupOrCreateReservedFn(
-      b, moduleOp, kPrintApFloat,
-      {IntegerType::get(moduleOp->getContext(), 32),
-       IntegerType::get(moduleOp->getContext(), 64)},
-      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
-}
-
 static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) {
   return LLVM::LLVMPointerType::get(context);
 }
diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
deleted file mode 100644
index 85ea0986cde5b..0000000000000
--- a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-//===- APFloatWrappers.cpp - Software Implementation of FP Arithmetics --- ===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file exposes the APFloat infrastructure to MLIR programs as a runtime
-// library. APFloat is a software implementation of floating point arithmetics.
-//
-// On the MLIR side, floating-point values must be bitcasted to 64-bit integers
-// before calling a runtime function. If a floating-point type has less than
-// 64 bits, it must be zero-extended to 64 bits after bitcasting it to an
-// integer.
-//
-// Runtime functions receive the floating-point operands of the arithmeic
-// operation in the form of 64-bit integers, along with the APFloat semantics
-// in the form of a 32-bit integer, which will be interpreted as an
-// APFloatBase::Semantics enum value.
-//
-#include "llvm/ADT/APFloat.h"
-
-#if (defined(_WIN32) || defined(__CYGWIN__))
-#define MLIR_APFLOAT_WRAPPERS_EXPORTED __declspec(dllexport)
-#else
-#define MLIR_APFLOAT_WRAPPERS_EXPORTED __attribute__((visibility("default")))
-#endif
-
-/// Binary operations without rounding mode.
-#define APFLOAT_BINARY_OP(OP)                                                  \
-  int64_t MLIR_APFLOAT_WRAPPERS_EXPORTED __mlir_apfloat_##OP(                  \
-      int32_t semantics, uint64_t a, uint64_t b) {                             \
-    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
-        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
-    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
-    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
-    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
-    lhs.OP(rhs);                                                               \
-    return lhs.bitcastToAPInt().getZExtValue();                                \
-  }
-
-/// Binary operations with rounding mode.
-#define APFLOAT_BINARY_OP_ROUNDING_MODE(OP, ROUNDING_MODE)                     \
-  int64_t MLIR_APFLOAT_WRAPPERS_EXPORTED __mlir_apfloat_##OP(                  \
-      int32_t semantics, uint64_t a, uint64_t b) {                             \
-    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
-        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
-    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
-    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
-    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
-    lhs.OP(rhs, ROUNDING_MODE);                                                \
-    return lhs.bitcastToAPInt().getZExtValue();                                \
-  }
-
-extern "C" {
-
-#define BIN_OPS_WITH_ROUNDING(X)                                               \
-  X(add, llvm::RoundingMode::NearestTiesToEven)                                \
-  X(subtract, llvm::RoundingMode::NearestTiesToEven)                           \
-  X(multiply, llvm::RoundingMode::NearestTiesToEven)                           \
-  X(divide, llvm::RoundingMode::NearestTiesToEven)
-
-BIN_OPS_WITH_ROUNDING(APFLOAT_BINARY_OP_ROUNDING_MODE)
-#undef BIN_OPS_WITH_ROUNDING
-#undef APFLOAT_BINARY_OP_ROUNDING_MODE
-
-APFLOAT_BINARY_OP(remainder)
-
-#undef APFLOAT_BINARY_OP
-
-void MLIR_APFLOAT_WRAPPERS_EXPORTED printApFloat(int32_t semantics,
-                                                 uint64_t a) {
-  const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(
-      static_cast<llvm::APFloatBase::Semantics>(semantics));
-  unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);
-  llvm::APFloat x(sem, llvm::APInt(bitWidth, a));
-  double d = x.convertToDouble();
-  fprintf(stdout, "%lg", d);
-}
-}
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 8c09e50e4de7b..fdeb4dacf9278 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -2,7 +2,6 @@
 # is a big dependency which most don't need.
 
 set(LLVM_OPTIONAL_SOURCES
-  APFloatWrappers.cpp
   ArmRunnerUtils.cpp
   ArmSMEStubs.cpp
   AsyncRuntime.cpp
@@ -168,15 +167,6 @@ if(LLVM_ENABLE_PIC)
   set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS)
 
-  add_mlir_library(mlir_apfloat_wrappers
-    SHARED
-    APFloatWrappers.cpp
-
-    EXCLUDE_FROM_LIBMLIR
-    )
-  set_property(TARGET mlir_apfloat_wrappers PROPERTY CXX_STANDARD 17)
-  target_compile_definitions(mlir_apfloat_wrappers PRIVATE mlir_apfloat_wrappers_EXPORTS)
-
   add_subdirectory(SparseTensor)
 
   add_mlir_library(mlir_c_runner_utils
@@ -187,7 +177,6 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
-    mlir_apfloat_wrappers
     mlir_float16_utils
     MLIRSparseTensorEnums
     MLIRSparseTensorRuntime
@@ -202,7 +191,6 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
-    mlir_apfloat_wrappers
     mlir_float16_utils
   )
   target_compile_definitions(mlir_runner_utils PRIVATE mlir_runner_utils_EXPORTS)
diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
deleted file mode 100644
index fe4d28a56f808..0000000000000
--- a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
+++ /dev/null
@@ -1,128 +0,0 @@
-// RUN: mlir-opt %s --convert-arith-to-apfloat -split-input-file -verify-diagnostics | FileCheck %s
-
-// CHECK-LABEL:   func.func private @__mlir_apfloat_add(i32, i64, i64) -> i64
-
-// CHECK-LABEL:   func.func @foo() -> f8E4M3FN {
-// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 2.250000e+00 : f8E4M3FN
-// CHECK:           return %[[CONSTANT_0]] : f8E4M3FN
-// CHECK:         }
-
-// CHECK-LABEL:   func.func @bar() -> f6E3M2FN {
-// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 3.000000e+00 : f6E3M2FN
-// CHECK:           return %[[CONSTANT_0]] : f6E3M2FN
-// CHECK:         }
-
-// Illustrate that both f8E4M3FN and f6E3M2FN calling the same __mlir_apfloat_add is fine
-// because each gets its own semantics enum and gets bitcast/extui/trunci to its own width.
-// CHECK-LABEL:   func.func @full_example() {
-// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 1.375000e+00 : f8E4M3FN
-// CHECK:           %[[VAL_0:.*]] = call @foo() : () -> f8E4M3FN
-// CHECK:           %[[BITCAST_0:.*]] = arith.bitcast %[[CONSTANT_0]] : f8E4M3FN to i8
-// CHECK:           %[[EXTUI_0:.*]] = arith.extui %[[BITCAST_0]] : i8 to i64
-// CHECK:           %[[BITCAST_1:.*]] = arith.bitcast %[[VAL_0]] : f8E4M3FN to i8
-// CHECK:           %[[EXTUI_1:.*]] = arith.extui %[[BITCAST_1]] : i8 to i64
-//                  // fltSemantics semantics for f8E4M3FN
-// CHECK:           %[[CONSTANT_1:.*]] = arith.constant 10 : i32
-// CHECK:           %[[VAL_1:.*]] = call @__mlir_apfloat_add(%[[CONSTANT_1]], %[[EXTUI_0]], %[[EXTUI_1]]) : (i32, i64, i64) -> i64
-// CHECK:           %[[TRUNCI_0:.*]] = arith.trunci %[[VAL_1]] : i64 to i8
-// CHECK:           %[[BITCAST_2:.*]] = arith.bitcast %[[TRUNCI_0]] : i8 to f8E4M3FN
-// CHECK:           vector.print %[[BITCAST_2]] : f8E4M3FN
-
-// CHECK:           %[[CONSTANT_2:.*]] = arith.constant 2.500000e+00 : f6E3M2FN
-// CHECK:           %[[VAL_2:.*]] = call @bar() : () -> f6E3M2FN
-// CHECK:           %[[BITCAST_3:.*]] = arith.bitcast %[[CONSTANT_2]] : f6E3M2FN to i6
-// CHECK:           %[[EXTUI_2:.*]] = arith.extui %[[BITCAST_3]] : i6 to i64
-// CHECK:           %[[BITCAST_4:.*]] = arith.bitcast %[[VAL_2]] : f6E3M2FN to i6
-// CHECK:           %[[EXTUI_3:.*]] = arith.extui %[[BITCAST_4]] : i6 to i64
-//                  // fltSemantics semantics for f6E3M2FN
-// CHECK:           %[[CONSTANT_3:.*]] = arith.constant 16 : i32
-// CHECK:           %[[VAL_3:.*]] = call @__mlir_apfloat_add(%[[CONSTANT_3]], %[[EXTUI_2]], %[[EXTUI_3]]) : (i32, i64, i64) -> i64
-// CHECK:           %[[TRUNCI_1:.*]] = arith.trunci %[[VAL_3]] : i64 to i6
-// CHECK:           %[[BITCAST_5:.*]] = arith.bitcast %[[TRUNCI_1]] : i6 to f6E3M2FN
-// CHECK:           vector.print %[[BITCAST_5]] : f6E3M2FN
-// CHECK:           return
-// CHECK:         }
-
-// Put rhs into separate function so that it won't be constant-folded.
-func.func @foo() -> f8E4M3FN {
-  %cst = arith.constant 2.2 : f8E4M3FN
-  return %cst : f8E4M3FN
-}
-
-func.func @bar() -> f6E3M2FN {
-  %cst = arith.constant 3.2 : f6E3M2FN
-  return %cst : f6E3M2FN
-}
-
-func.func @full_example() {
-  %a = arith.constant 1.4 : f8E4M3FN
-  %b = func.call @foo() : () -> (f8E4M3FN)
-  %c = arith.addf %a, %b : f8E4M3FN
-  vector.print %c : f8E4M3FN
-
-  %d = arith.constant 2.4 : f6E3M2FN
-  %e = func.call @bar() : () -> (f6E3M2FN)
-  %f = arith.addf %d, %e : f6E3M2FN
-  vector.print %f : f6E3M2FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @__mlir_apfloat_add(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @__mlir_apfloat_add(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// Test decl collision (different type)
-// expected-error@+1{{matched function '__mlir_apfloat_add' but with different type: '(i32, i32, f32) -> index' (expected '(i32, i64, i64) -> i64')}}
-func.func private @__mlir_apfloat_add(i32, i32, f32) -> index
-func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @__mlir_apfloat_subtract(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @__mlir_apfloat_subtract(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.subf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @__mlir_apfloat_multiply(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @__mlir_apfloat_multiply(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.mulf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @__mlir_apfloat_divide(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @__mlir_apfloat_divide(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.divf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @__mlir_apfloat_remainder(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @__mlir_apfloat_remainder(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @remf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.remf %arg0, %arg1 : f4E2M1FN
-  return
-}
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
deleted file mode 100644
index a2b3eb73a60b8..0000000000000
--- a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
+++ /dev/null
@@ -1,34 +0,0 @@
-// Case 1: All floating-point arithmetics is lowered through APFloat.
-// RUN: mlir-opt %s --convert-arith-to-apfloat --convert-to-llvm | \
-// RUN: mlir-runner -e entry --entry-point-result=void \
-// RUN:             --shared-libs=%mlir_c_runner_utils | FileCheck %s
-
-// Case 2: Only unsupported arithmetics (f8E4M3FN) is lowered through APFloat.
-//         Arithmetics on f32 is lowered directly to LLVM.
-// RUN: mlir-opt %s --convert-to-llvm --convert-arith-to-apfloat \
-// RUN:          --convert-to-llvm --reconcile-unrealized-casts | \
-// RUN: mlir-runner -e entry --entry-point-result=void \
-// RUN:             --shared-libs=%mlir_c_runner_utils | FileCheck %s
-
-// Put rhs into separate function so that it won't be constant-folded.
-func.func @foo() -> (f8E4M3FN, f32) {
-  %cst1 = arith.constant 2.2 : f8E4M3FN
-  %cst2 = arith.constant 2.2 : f32
-  return %cst1, %cst2 : f8E4M3FN, f32
-}
-
-func.func @entry() {
-  %a1 = arith.constant 1.4 : f8E4M3FN
-  %a2 = arith.constant 1.4 : f32
-  %b1, %b2 = func.call @foo() : () -> (f8E4M3FN, f32)
-  %c1 = arith.addf %a1, %b1 : f8E4M3FN  // not supported by LLVM
-  %c2 = arith.addf %a2, %b2 : f32       // supported by LLVM
-
-  // CHECK: 3.5
-  vector.print %c1 : f8E4M3FN
-
-  // CHECK: 3.6
-  vector.print %c2 : f32
-
-  return
-}

From 0e639ae6e030ade849fa7a09cb7dc40b42f25373 Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental@gmail.com>
Date: Mon, 10 Nov 2025 17:13:29 -0800
Subject: [PATCH 45/97] Reapply "[mlir] Add FP software implementation lowering
 pass: `arith-to-apfloat` (#166618)" (#167431)

Reland https://github.com/llvm/llvm-project/pull/166618 with
MLIRFuncUtils linked in.
---
 .../ArithToAPFloat/ArithToAPFloat.h           |  21 +++
 mlir/include/mlir/Conversion/Passes.h         |   1 +
 mlir/include/mlir/Conversion/Passes.td        |  15 ++
 mlir/include/mlir/Dialect/Func/Utils/Utils.h  |   7 +
 .../mlir/Dialect/LLVMIR/FunctionCallUtils.h   |   4 +
 .../ArithToAPFloat/ArithToAPFloat.cpp         | 161 ++++++++++++++++++
 .../Conversion/ArithToAPFloat/CMakeLists.txt  |  18 ++
 .../Conversion/ArithToLLVM/ArithToLLVM.cpp    |   1 +
 mlir/lib/Conversion/CMakeLists.txt            |   1 +
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      |  14 ++
 mlir/lib/Dialect/Func/Utils/Utils.cpp         |  25 +++
 .../Dialect/LLVMIR/IR/FunctionCallUtils.cpp   |  11 ++
 mlir/lib/ExecutionEngine/APFloatWrappers.cpp  |  81 +++++++++
 mlir/lib/ExecutionEngine/CMakeLists.txt       |  12 ++
 .../ArithToApfloat/arith-to-apfloat.mlir      | 128 ++++++++++++++
 .../Arith/CPU/test-apfloat-emulation.mlir     |  34 ++++
 16 files changed, 534 insertions(+)
 create mode 100644 mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
 create mode 100644 mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
 create mode 100644 mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
 create mode 100644 mlir/lib/ExecutionEngine/APFloatWrappers.cpp
 create mode 100644 mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
 create mode 100644 mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir

diff --git a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
new file mode 100644
index 0000000000000..64a42a228199e
--- /dev/null
+++ b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
@@ -0,0 +1,21 @@
+//===- ArithToAPFloat.h - Arith to APFloat impl conversion ---*- C++ ----*-===//
+//
+// Part of the APFloat Project, under the Apache License v2.0 with APFloat
+// Exceptions. See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH APFloat-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
+#define MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
+
+#include <memory>
+
+namespace mlir {
+class Pass;
+
+#define GEN_PASS_DECL_ARITHTOAPFLOATCONVERSIONPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index 40d866ec7bf10..82bdfd02661a6 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -12,6 +12,7 @@
 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h"
+#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
 #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h"
 #include "mlir/Conversion/ArithToEmitC/ArithToEmitCPass.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 70e3e45c225db..79bc380dbcb7a 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -186,6 +186,21 @@ def ArithToLLVMConversionPass : Pass<"convert-arith-to-llvm"> {
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// ArithToAPFloat
+//===----------------------------------------------------------------------===//
+
+def ArithToAPFloatConversionPass
+    : Pass<"convert-arith-to-apfloat", "ModuleOp"> {
+  let summary = "Convert Arith ops to APFloat runtime library calls";
+  let description = [{
+    This pass converts supported Arith ops to APFloat-based runtime library
+    calls (APFloatWrappers.cpp). APFloat is a software implementation of
+    floating-point arithmetic operations.
+  }];
+  let dependentDialects = ["func::FuncDialect"];
+}
+
 //===----------------------------------------------------------------------===//
 // ArithToSPIRV
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Func/Utils/Utils.h b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
index 3576126a487ac..00d50874a2e8d 100644
--- a/mlir/include/mlir/Dialect/Func/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
@@ -60,6 +60,13 @@ mlir::FailureOr<std::pair<mlir::func::FuncOp, mlir::func::CallOp>>
 deduplicateArgsOfFuncOp(mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp,
                         mlir::ModuleOp moduleOp);
 
+/// Look up a FuncOp with signature `resultTypes`(`paramTypes`)` and name
+/// `name`. Return a failure if the FuncOp is found but with a different
+/// signature.
+FailureOr<FuncOp> lookupFnDecl(SymbolOpInterface symTable, StringRef name,
+                               FunctionType funcT,
+                               SymbolTableCollection *symbolTables = nullptr);
+
 } // namespace func
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
index 8ad9ed18acebd..b09d32022e348 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
@@ -52,6 +52,10 @@ lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp,
 FailureOr<LLVM::LLVMFuncOp>
 lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
                          SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
+                             SymbolTableCollection *symbolTables = nullptr);
+
 /// Declares a function to print a C-string.
 /// If a custom runtime function is defined via `runtimeFunctionName`, it must
 /// have the signature void(char const*). The default function is `printString`.
diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
new file mode 100644
index 0000000000000..012e934d3050f
--- /dev/null
+++ b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
@@ -0,0 +1,161 @@
+//===- ArithToAPFloat.cpp - Arithmetic to APFloat Conversion --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Func/Utils/Utils.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Verifier.h"
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "arith-to-apfloat"
+
+namespace mlir {
+#define GEN_PASS_DEF_ARITHTOAPFLOATCONVERSIONPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::func;
+
+static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable,
+                           StringRef name, FunctionType funcT, bool setPrivate,
+                           SymbolTableCollection *symbolTables = nullptr) {
+  OpBuilder::InsertionGuard g(b);
+  assert(!symTable->getRegion(0).empty() && "expected non-empty region");
+  b.setInsertionPointToStart(&symTable->getRegion(0).front());
+  FuncOp funcOp = FuncOp::create(b, symTable->getLoc(), name, funcT);
+  if (setPrivate)
+    funcOp.setPrivate();
+  if (symbolTables) {
+    SymbolTable &symbolTable = symbolTables->getSymbolTable(symTable);
+    symbolTable.insert(funcOp, symTable->getRegion(0).front().begin());
+  }
+  return funcOp;
+}
+
+/// Helper function to look up or create the symbol for a runtime library
+/// function for a binary arithmetic operation.
+///
+/// Parameter 1: APFloat semantics
+/// Parameter 2: Left-hand side operand
+/// Parameter 3: Right-hand side operand
+///
+/// This function will return a failure if the function is found but has an
+/// unexpected signature.
+///
+static FailureOr<FuncOp>
+lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name,
+                       SymbolTableCollection *symbolTables = nullptr) {
+  auto i32Type = IntegerType::get(symTable->getContext(), 32);
+  auto i64Type = IntegerType::get(symTable->getContext(), 64);
+
+  std::string funcName = (llvm::Twine("__mlir_apfloat_") + name).str();
+  FunctionType funcT =
+      FunctionType::get(b.getContext(), {i32Type, i64Type, i64Type}, {i64Type});
+  FailureOr<FuncOp> func =
+      lookupFnDecl(symTable, funcName, funcT, symbolTables);
+  // Failed due to type mismatch.
+  if (failed(func))
+    return func;
+  // Successfully matched existing decl.
+  if (*func)
+    return *func;
+
+  return createFnDecl(b, symTable, funcName, funcT,
+                      /*setPrivate=*/true, symbolTables);
+}
+
+/// Rewrite a binary arithmetic operation to an APFloat function call.
+template <typename OpTy, const char *APFloatName>
+struct BinaryArithOpToAPFloatConversion final : OpRewritePattern<OpTy> {
+  BinaryArithOpToAPFloatConversion(MLIRContext *context, PatternBenefit benefit,
+                                   SymbolOpInterface symTable)
+      : OpRewritePattern<OpTy>(context, benefit), symTable(symTable) {};
+
+  LogicalResult matchAndRewrite(OpTy op,
+                                PatternRewriter &rewriter) const override {
+    // Get APFloat function from runtime library.
+    FailureOr<FuncOp> fn =
+        lookupOrCreateBinaryFn(rewriter, symTable, APFloatName);
+    if (failed(fn))
+      return fn;
+
+    rewriter.setInsertionPoint(op);
+    // Cast operands to 64-bit integers.
+    Location loc = op.getLoc();
+    auto floatTy = cast<FloatType>(op.getType());
+    auto intWType = rewriter.getIntegerType(floatTy.getWidth());
+    auto int64Type = rewriter.getI64Type();
+    Value lhsBits = arith::ExtUIOp::create(
+        rewriter, loc, int64Type,
+        arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs()));
+    Value rhsBits = arith::ExtUIOp::create(
+        rewriter, loc, int64Type,
+        arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs()));
+
+    // Call APFloat function.
+    int32_t sem =
+        llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
+    Value semValue = arith::ConstantOp::create(
+        rewriter, loc, rewriter.getI32Type(),
+        rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
+    SmallVector<Value> params = {semValue, lhsBits, rhsBits};
+    auto resultOp =
+        func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()),
+                             SymbolRefAttr::get(*fn), params);
+
+    // Truncate result to the original width.
+    Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType,
+                                                  resultOp->getResult(0));
+    rewriter.replaceOp(
+        op, arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits));
+    return success();
+  }
+
+  SymbolOpInterface symTable;
+};
+
+namespace {
+struct ArithToAPFloatConversionPass final
+    : impl::ArithToAPFloatConversionPassBase<ArithToAPFloatConversionPass> {
+  using Base::Base;
+
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    RewritePatternSet patterns(context);
+    static const char add[] = "add";
+    static const char subtract[] = "subtract";
+    static const char multiply[] = "multiply";
+    static const char divide[] = "divide";
+    static const char remainder[] = "remainder";
+    patterns.add<BinaryArithOpToAPFloatConversion<arith::AddFOp, add>,
+                 BinaryArithOpToAPFloatConversion<arith::SubFOp, subtract>,
+                 BinaryArithOpToAPFloatConversion<arith::MulFOp, multiply>,
+                 BinaryArithOpToAPFloatConversion<arith::DivFOp, divide>,
+                 BinaryArithOpToAPFloatConversion<arith::RemFOp, remainder>>(
+        context, 1, getOperation());
+    LogicalResult result = success();
+    ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) {
+      if (diag.getSeverity() == DiagnosticSeverity::Error) {
+        result = failure();
+      }
+      // NB: if you don't return failure, no other diag handlers will fire (see
+      // mlir/lib/IR/Diagnostics.cpp:DiagnosticEngineImpl::emit).
+      return failure();
+    });
+    walkAndApplyPatterns(getOperation(), std::move(patterns));
+    if (failed(result))
+      return signalPassFailure();
+  }
+};
+} // namespace
diff --git a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
new file mode 100644
index 0000000000000..b5ec49c087163
--- /dev/null
+++ b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_mlir_conversion_library(MLIRArithToAPFloat
+  ArithToAPFloat.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ArithToLLVM
+
+  DEPENDS
+  MLIRConversionPassIncGen
+
+  LINK_COMPONENTS
+  Core
+
+  LINK_LIBS PUBLIC
+  MLIRArithDialect
+  MLIRArithTransforms
+  MLIRFuncDialect
+  MLIRFuncUtils
+  )
diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
index b6099902cc337..f2bacc3399144 100644
--- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
+++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/TypeUtilities.h"
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index bebf1b8fff3f9..613dc6d242ceb 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -2,6 +2,7 @@ add_subdirectory(AffineToStandard)
 add_subdirectory(AMDGPUToROCDL)
 add_subdirectory(ArithCommon)
 add_subdirectory(ArithToAMDGPU)
+add_subdirectory(ArithToAPFloat)
 add_subdirectory(ArithToArmSME)
 add_subdirectory(ArithToEmitC)
 add_subdirectory(ArithToLLVM)
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 69a317ecd101f..c747e1b59558a 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1654,6 +1654,20 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern<vector::PrintOp> {
           return failure();
         }
       }
+    } else if (auto floatTy = dyn_cast<FloatType>(printType)) {
+      // Print other floating-point types using the APFloat runtime library.
+      int32_t sem =
+          llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
+      Value semValue = LLVM::ConstantOp::create(
+          rewriter, loc, rewriter.getI32Type(),
+          rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
+      Value floatBits =
+          LLVM::ZExtOp::create(rewriter, loc, rewriter.getI64Type(), value);
+      printer =
+          LLVM::lookupOrCreateApFloatPrintFn(rewriter, parent, symbolTables);
+      emitCall(rewriter, loc, printer.value(),
+               ValueRange({semValue, floatBits}));
+      return success();
     } else {
       return failure();
     }
diff --git a/mlir/lib/Dialect/Func/Utils/Utils.cpp b/mlir/lib/Dialect/Func/Utils/Utils.cpp
index b4cb0932ef631..d6dfd0229963c 100644
--- a/mlir/lib/Dialect/Func/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Func/Utils/Utils.cpp
@@ -254,3 +254,28 @@ func::deduplicateArgsOfFuncOp(RewriterBase &rewriter, func::FuncOp funcOp,
 
   return std::make_pair(*newFuncOpOrFailure, newCallOp);
 }
+
+FailureOr<func::FuncOp>
+func::lookupFnDecl(SymbolOpInterface symTable, StringRef name,
+                   FunctionType funcT, SymbolTableCollection *symbolTables) {
+  FuncOp func;
+  if (symbolTables) {
+    func = symbolTables->lookupSymbolIn<FuncOp>(
+        symTable, StringAttr::get(symTable->getContext(), name));
+  } else {
+    func = llvm::dyn_cast_or_null<FuncOp>(
+        SymbolTable::lookupSymbolIn(symTable, name));
+  }
+
+  if (!func)
+    return func;
+
+  mlir::FunctionType foundFuncT = func.getFunctionType();
+  // Assert the signature of the found function is same as expected
+  if (funcT != foundFuncT) {
+    return func.emitError("matched function '")
+           << name << "' but with different type: " << foundFuncT
+           << " (expected " << funcT << ")";
+  }
+  return func;
+}
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index feaffa34897b6..160b6ae89215c 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -30,6 +30,7 @@ static constexpr llvm::StringRef kPrintF16 = "printF16";
 static constexpr llvm::StringRef kPrintBF16 = "printBF16";
 static constexpr llvm::StringRef kPrintF32 = "printF32";
 static constexpr llvm::StringRef kPrintF64 = "printF64";
+static constexpr llvm::StringRef kPrintApFloat = "printApFloat";
 static constexpr llvm::StringRef kPrintString = "printString";
 static constexpr llvm::StringRef kPrintOpen = "printOpen";
 static constexpr llvm::StringRef kPrintClose = "printClose";
@@ -160,6 +161,16 @@ mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
       LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
+FailureOr<LLVM::LLVMFuncOp>
+mlir::LLVM::lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
+                                         SymbolTableCollection *symbolTables) {
+  return lookupOrCreateReservedFn(
+      b, moduleOp, kPrintApFloat,
+      {IntegerType::get(moduleOp->getContext(), 32),
+       IntegerType::get(moduleOp->getContext(), 64)},
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
+}
+
 static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) {
   return LLVM::LLVMPointerType::get(context);
 }
diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
new file mode 100644
index 0000000000000..85ea0986cde5b
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
@@ -0,0 +1,81 @@
+//===- APFloatWrappers.cpp - Software Implementation of FP Arithmetics --- ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes the APFloat infrastructure to MLIR programs as a runtime
+// library. APFloat is a software implementation of floating point arithmetics.
+//
+// On the MLIR side, floating-point values must be bitcasted to 64-bit integers
+// before calling a runtime function. If a floating-point type has less than
+// 64 bits, it must be zero-extended to 64 bits after bitcasting it to an
+// integer.
+//
+// Runtime functions receive the floating-point operands of the arithmeic
+// operation in the form of 64-bit integers, along with the APFloat semantics
+// in the form of a 32-bit integer, which will be interpreted as an
+// APFloatBase::Semantics enum value.
+//
+#include "llvm/ADT/APFloat.h"
+
+#if (defined(_WIN32) || defined(__CYGWIN__))
+#define MLIR_APFLOAT_WRAPPERS_EXPORTED __declspec(dllexport)
+#else
+#define MLIR_APFLOAT_WRAPPERS_EXPORTED __attribute__((visibility("default")))
+#endif
+
+/// Binary operations without rounding mode.
+#define APFLOAT_BINARY_OP(OP)                                                  \
+  int64_t MLIR_APFLOAT_WRAPPERS_EXPORTED __mlir_apfloat_##OP(                  \
+      int32_t semantics, uint64_t a, uint64_t b) {                             \
+    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
+        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
+    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
+    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
+    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
+    lhs.OP(rhs);                                                               \
+    return lhs.bitcastToAPInt().getZExtValue();                                \
+  }
+
+/// Binary operations with rounding mode.
+#define APFLOAT_BINARY_OP_ROUNDING_MODE(OP, ROUNDING_MODE)                     \
+  int64_t MLIR_APFLOAT_WRAPPERS_EXPORTED __mlir_apfloat_##OP(                  \
+      int32_t semantics, uint64_t a, uint64_t b) {                             \
+    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
+        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
+    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
+    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
+    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
+    lhs.OP(rhs, ROUNDING_MODE);                                                \
+    return lhs.bitcastToAPInt().getZExtValue();                                \
+  }
+
+extern "C" {
+
+#define BIN_OPS_WITH_ROUNDING(X)                                               \
+  X(add, llvm::RoundingMode::NearestTiesToEven)                                \
+  X(subtract, llvm::RoundingMode::NearestTiesToEven)                           \
+  X(multiply, llvm::RoundingMode::NearestTiesToEven)                           \
+  X(divide, llvm::RoundingMode::NearestTiesToEven)
+
+BIN_OPS_WITH_ROUNDING(APFLOAT_BINARY_OP_ROUNDING_MODE)
+#undef BIN_OPS_WITH_ROUNDING
+#undef APFLOAT_BINARY_OP_ROUNDING_MODE
+
+APFLOAT_BINARY_OP(remainder)
+
+#undef APFLOAT_BINARY_OP
+
+void MLIR_APFLOAT_WRAPPERS_EXPORTED printApFloat(int32_t semantics,
+                                                 uint64_t a) {
+  const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(
+      static_cast<llvm::APFloatBase::Semantics>(semantics));
+  unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);
+  llvm::APFloat x(sem, llvm::APInt(bitWidth, a));
+  double d = x.convertToDouble();
+  fprintf(stdout, "%lg", d);
+}
+}
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index fdeb4dacf9278..8c09e50e4de7b 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -2,6 +2,7 @@
 # is a big dependency which most don't need.
 
 set(LLVM_OPTIONAL_SOURCES
+  APFloatWrappers.cpp
   ArmRunnerUtils.cpp
   ArmSMEStubs.cpp
   AsyncRuntime.cpp
@@ -167,6 +168,15 @@ if(LLVM_ENABLE_PIC)
   set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS)
 
+  add_mlir_library(mlir_apfloat_wrappers
+    SHARED
+    APFloatWrappers.cpp
+
+    EXCLUDE_FROM_LIBMLIR
+    )
+  set_property(TARGET mlir_apfloat_wrappers PROPERTY CXX_STANDARD 17)
+  target_compile_definitions(mlir_apfloat_wrappers PRIVATE mlir_apfloat_wrappers_EXPORTS)
+
   add_subdirectory(SparseTensor)
 
   add_mlir_library(mlir_c_runner_utils
@@ -177,6 +187,7 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
+    mlir_apfloat_wrappers
     mlir_float16_utils
     MLIRSparseTensorEnums
     MLIRSparseTensorRuntime
@@ -191,6 +202,7 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
+    mlir_apfloat_wrappers
     mlir_float16_utils
   )
   target_compile_definitions(mlir_runner_utils PRIVATE mlir_runner_utils_EXPORTS)
diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
new file mode 100644
index 0000000000000..fe4d28a56f808
--- /dev/null
+++ b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
@@ -0,0 +1,128 @@
+// RUN: mlir-opt %s --convert-arith-to-apfloat -split-input-file -verify-diagnostics | FileCheck %s
+
+// CHECK-LABEL:   func.func private @__mlir_apfloat_add(i32, i64, i64) -> i64
+
+// CHECK-LABEL:   func.func @foo() -> f8E4M3FN {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 2.250000e+00 : f8E4M3FN
+// CHECK:           return %[[CONSTANT_0]] : f8E4M3FN
+// CHECK:         }
+
+// CHECK-LABEL:   func.func @bar() -> f6E3M2FN {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 3.000000e+00 : f6E3M2FN
+// CHECK:           return %[[CONSTANT_0]] : f6E3M2FN
+// CHECK:         }
+
+// Illustrate that both f8E4M3FN and f6E3M2FN calling the same __mlir_apfloat_add is fine
+// because each gets its own semantics enum and gets bitcast/extui/trunci to its own width.
+// CHECK-LABEL:   func.func @full_example() {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 1.375000e+00 : f8E4M3FN
+// CHECK:           %[[VAL_0:.*]] = call @foo() : () -> f8E4M3FN
+// CHECK:           %[[BITCAST_0:.*]] = arith.bitcast %[[CONSTANT_0]] : f8E4M3FN to i8
+// CHECK:           %[[EXTUI_0:.*]] = arith.extui %[[BITCAST_0]] : i8 to i64
+// CHECK:           %[[BITCAST_1:.*]] = arith.bitcast %[[VAL_0]] : f8E4M3FN to i8
+// CHECK:           %[[EXTUI_1:.*]] = arith.extui %[[BITCAST_1]] : i8 to i64
+//                  // fltSemantics semantics for f8E4M3FN
+// CHECK:           %[[CONSTANT_1:.*]] = arith.constant 10 : i32
+// CHECK:           %[[VAL_1:.*]] = call @__mlir_apfloat_add(%[[CONSTANT_1]], %[[EXTUI_0]], %[[EXTUI_1]]) : (i32, i64, i64) -> i64
+// CHECK:           %[[TRUNCI_0:.*]] = arith.trunci %[[VAL_1]] : i64 to i8
+// CHECK:           %[[BITCAST_2:.*]] = arith.bitcast %[[TRUNCI_0]] : i8 to f8E4M3FN
+// CHECK:           vector.print %[[BITCAST_2]] : f8E4M3FN
+
+// CHECK:           %[[CONSTANT_2:.*]] = arith.constant 2.500000e+00 : f6E3M2FN
+// CHECK:           %[[VAL_2:.*]] = call @bar() : () -> f6E3M2FN
+// CHECK:           %[[BITCAST_3:.*]] = arith.bitcast %[[CONSTANT_2]] : f6E3M2FN to i6
+// CHECK:           %[[EXTUI_2:.*]] = arith.extui %[[BITCAST_3]] : i6 to i64
+// CHECK:           %[[BITCAST_4:.*]] = arith.bitcast %[[VAL_2]] : f6E3M2FN to i6
+// CHECK:           %[[EXTUI_3:.*]] = arith.extui %[[BITCAST_4]] : i6 to i64
+//                  // fltSemantics semantics for f6E3M2FN
+// CHECK:           %[[CONSTANT_3:.*]] = arith.constant 16 : i32
+// CHECK:           %[[VAL_3:.*]] = call @__mlir_apfloat_add(%[[CONSTANT_3]], %[[EXTUI_2]], %[[EXTUI_3]]) : (i32, i64, i64) -> i64
+// CHECK:           %[[TRUNCI_1:.*]] = arith.trunci %[[VAL_3]] : i64 to i6
+// CHECK:           %[[BITCAST_5:.*]] = arith.bitcast %[[TRUNCI_1]] : i6 to f6E3M2FN
+// CHECK:           vector.print %[[BITCAST_5]] : f6E3M2FN
+// CHECK:           return
+// CHECK:         }
+
+// Put rhs into separate function so that it won't be constant-folded.
+func.func @foo() -> f8E4M3FN {
+  %cst = arith.constant 2.2 : f8E4M3FN
+  return %cst : f8E4M3FN
+}
+
+func.func @bar() -> f6E3M2FN {
+  %cst = arith.constant 3.2 : f6E3M2FN
+  return %cst : f6E3M2FN
+}
+
+func.func @full_example() {
+  %a = arith.constant 1.4 : f8E4M3FN
+  %b = func.call @foo() : () -> (f8E4M3FN)
+  %c = arith.addf %a, %b : f8E4M3FN
+  vector.print %c : f8E4M3FN
+
+  %d = arith.constant 2.4 : f6E3M2FN
+  %e = func.call @bar() : () -> (f6E3M2FN)
+  %f = arith.addf %d, %e : f6E3M2FN
+  vector.print %f : f6E3M2FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @__mlir_apfloat_add(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @__mlir_apfloat_add(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// Test decl collision (different type)
+// expected-error@+1{{matched function '__mlir_apfloat_add' but with different type: '(i32, i32, f32) -> index' (expected '(i32, i64, i64) -> i64')}}
+func.func private @__mlir_apfloat_add(i32, i32, f32) -> index
+func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @__mlir_apfloat_subtract(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @__mlir_apfloat_subtract(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.subf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @__mlir_apfloat_multiply(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @__mlir_apfloat_multiply(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.mulf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @__mlir_apfloat_divide(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @__mlir_apfloat_divide(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.divf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @__mlir_apfloat_remainder(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @__mlir_apfloat_remainder(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @remf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.remf %arg0, %arg1 : f4E2M1FN
+  return
+}
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
new file mode 100644
index 0000000000000..a2b3eb73a60b8
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
@@ -0,0 +1,34 @@
+// Case 1: All floating-point arithmetics is lowered through APFloat.
+// RUN: mlir-opt %s --convert-arith-to-apfloat --convert-to-llvm | \
+// RUN: mlir-runner -e entry --entry-point-result=void \
+// RUN:             --shared-libs=%mlir_c_runner_utils | FileCheck %s
+
+// Case 2: Only unsupported arithmetics (f8E4M3FN) is lowered through APFloat.
+//         Arithmetics on f32 is lowered directly to LLVM.
+// RUN: mlir-opt %s --convert-to-llvm --convert-arith-to-apfloat \
+// RUN:          --convert-to-llvm --reconcile-unrealized-casts | \
+// RUN: mlir-runner -e entry --entry-point-result=void \
+// RUN:             --shared-libs=%mlir_c_runner_utils | FileCheck %s
+
+// Put rhs into separate function so that it won't be constant-folded.
+func.func @foo() -> (f8E4M3FN, f32) {
+  %cst1 = arith.constant 2.2 : f8E4M3FN
+  %cst2 = arith.constant 2.2 : f32
+  return %cst1, %cst2 : f8E4M3FN, f32
+}
+
+func.func @entry() {
+  %a1 = arith.constant 1.4 : f8E4M3FN
+  %a2 = arith.constant 1.4 : f32
+  %b1, %b2 = func.call @foo() : () -> (f8E4M3FN, f32)
+  %c1 = arith.addf %a1, %b1 : f8E4M3FN  // not supported by LLVM
+  %c2 = arith.addf %a2, %b2 : f32       // supported by LLVM
+
+  // CHECK: 3.5
+  vector.print %c1 : f8E4M3FN
+
+  // CHECK: 3.6
+  vector.print %c2 : f32
+
+  return
+}

From 2677c29d6dd53fb0abbfa77c84cbe4f0af997132 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Mon, 10 Nov 2025 17:36:00 -0800
Subject: [PATCH 46/97] [NFC][SpecialCaseList] Store SectionStr as StringRef
 (#167278)

The SectionStr is already copied in addSection, so
there is no need to copy it again in the Section
constructor.

Co-authored-by: Matt Arsenault <Matthew.Arsenault@amd.com>
---
 llvm/include/llvm/Support/SpecialCaseList.h | 2 +-
 llvm/lib/Support/SpecialCaseList.cpp        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index dee4db584c88b..35411a1dea06b 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -232,7 +232,7 @@ class SpecialCaseList {
     findMatcher(StringRef Prefix, StringRef Category) const;
 
     Matcher SectionMatcher;
-    std::string SectionStr;
+    StringRef SectionStr;
     SectionEntries Entries;
     unsigned FileIdx;
   };
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index beec8b8ef0d5b..42c8933a43399 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -243,10 +243,10 @@ bool SpecialCaseList::createInternal(const MemoryBuffer *MB, std::string &Error,
 Expected<SpecialCaseList::Section *>
 SpecialCaseList::addSection(StringRef SectionStr, unsigned FileNo,
                             unsigned LineNo, bool UseGlobs) {
+  SectionStr = SectionStr.copy(StrAlloc);
   Sections.emplace_back(SectionStr, FileNo, UseGlobs);
   auto &Section = Sections.back();
 
-  SectionStr = SectionStr.copy(StrAlloc);
   if (auto Err = Section.SectionMatcher.insert(SectionStr, LineNo)) {
     return createStringError(errc::invalid_argument,
                              "malformed section at line " + Twine(LineNo) +

From 93b3c1b7e114ca50e6687d8c0a0d30508063616d Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental@gmail.com>
Date: Mon, 10 Nov 2025 17:40:51 -0800
Subject: [PATCH 47/97] Revert "Reapply "[mlir] Add FP software implementation
 lowering pass: `arith-to-apfloat` (#166618)" (#167431)" (#167435)

This reverts commit 0e639ae6e030ade849fa7a09cb7dc40b42f25373.
---
 .../ArithToAPFloat/ArithToAPFloat.h           |  21 ---
 mlir/include/mlir/Conversion/Passes.h         |   1 -
 mlir/include/mlir/Conversion/Passes.td        |  15 --
 mlir/include/mlir/Dialect/Func/Utils/Utils.h  |   7 -
 .../mlir/Dialect/LLVMIR/FunctionCallUtils.h   |   4 -
 .../ArithToAPFloat/ArithToAPFloat.cpp         | 161 ------------------
 .../Conversion/ArithToAPFloat/CMakeLists.txt  |  18 --
 .../Conversion/ArithToLLVM/ArithToLLVM.cpp    |   1 -
 mlir/lib/Conversion/CMakeLists.txt            |   1 -
 .../VectorToLLVM/ConvertVectorToLLVM.cpp      |  14 --
 mlir/lib/Dialect/Func/Utils/Utils.cpp         |  25 ---
 .../Dialect/LLVMIR/IR/FunctionCallUtils.cpp   |  11 --
 mlir/lib/ExecutionEngine/APFloatWrappers.cpp  |  81 ---------
 mlir/lib/ExecutionEngine/CMakeLists.txt       |  12 --
 .../ArithToApfloat/arith-to-apfloat.mlir      | 128 --------------
 .../Arith/CPU/test-apfloat-emulation.mlir     |  34 ----
 16 files changed, 534 deletions(-)
 delete mode 100644 mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
 delete mode 100644 mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
 delete mode 100644 mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
 delete mode 100644 mlir/lib/ExecutionEngine/APFloatWrappers.cpp
 delete mode 100644 mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
 delete mode 100644 mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir

diff --git a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
deleted file mode 100644
index 64a42a228199e..0000000000000
--- a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
+++ /dev/null
@@ -1,21 +0,0 @@
-//===- ArithToAPFloat.h - Arith to APFloat impl conversion ---*- C++ ----*-===//
-//
-// Part of the APFloat Project, under the Apache License v2.0 with APFloat
-// Exceptions. See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH APFloat-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
-#define MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
-
-#include <memory>
-
-namespace mlir {
-class Pass;
-
-#define GEN_PASS_DECL_ARITHTOAPFLOATCONVERSIONPASS
-#include "mlir/Conversion/Passes.h.inc"
-} // namespace mlir
-
-#endif // MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index 82bdfd02661a6..40d866ec7bf10 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -12,7 +12,6 @@
 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h"
-#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
 #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h"
 #include "mlir/Conversion/ArithToEmitC/ArithToEmitCPass.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 79bc380dbcb7a..70e3e45c225db 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -186,21 +186,6 @@ def ArithToLLVMConversionPass : Pass<"convert-arith-to-llvm"> {
   ];
 }
 
-//===----------------------------------------------------------------------===//
-// ArithToAPFloat
-//===----------------------------------------------------------------------===//
-
-def ArithToAPFloatConversionPass
-    : Pass<"convert-arith-to-apfloat", "ModuleOp"> {
-  let summary = "Convert Arith ops to APFloat runtime library calls";
-  let description = [{
-    This pass converts supported Arith ops to APFloat-based runtime library
-    calls (APFloatWrappers.cpp). APFloat is a software implementation of
-    floating-point arithmetic operations.
-  }];
-  let dependentDialects = ["func::FuncDialect"];
-}
-
 //===----------------------------------------------------------------------===//
 // ArithToSPIRV
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Func/Utils/Utils.h b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
index 00d50874a2e8d..3576126a487ac 100644
--- a/mlir/include/mlir/Dialect/Func/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
@@ -60,13 +60,6 @@ mlir::FailureOr<std::pair<mlir::func::FuncOp, mlir::func::CallOp>>
 deduplicateArgsOfFuncOp(mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp,
                         mlir::ModuleOp moduleOp);
 
-/// Look up a FuncOp with signature `resultTypes`(`paramTypes`)` and name
-/// `name`. Return a failure if the FuncOp is found but with a different
-/// signature.
-FailureOr<FuncOp> lookupFnDecl(SymbolOpInterface symTable, StringRef name,
-                               FunctionType funcT,
-                               SymbolTableCollection *symbolTables = nullptr);
-
 } // namespace func
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
index b09d32022e348..8ad9ed18acebd 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
@@ -52,10 +52,6 @@ lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp,
 FailureOr<LLVM::LLVMFuncOp>
 lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
                          SymbolTableCollection *symbolTables = nullptr);
-FailureOr<LLVM::LLVMFuncOp>
-lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
-                             SymbolTableCollection *symbolTables = nullptr);
-
 /// Declares a function to print a C-string.
 /// If a custom runtime function is defined via `runtimeFunctionName`, it must
 /// have the signature void(char const*). The default function is `printString`.
diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
deleted file mode 100644
index 012e934d3050f..0000000000000
--- a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-//===- ArithToAPFloat.cpp - Arithmetic to APFloat Conversion --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
-
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Arith/Transforms/Passes.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Func/Utils/Utils.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/Verifier.h"
-#include "mlir/Transforms/WalkPatternRewriteDriver.h"
-#include "llvm/Support/Debug.h"
-
-#define DEBUG_TYPE "arith-to-apfloat"
-
-namespace mlir {
-#define GEN_PASS_DEF_ARITHTOAPFLOATCONVERSIONPASS
-#include "mlir/Conversion/Passes.h.inc"
-} // namespace mlir
-
-using namespace mlir;
-using namespace mlir::func;
-
-static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable,
-                           StringRef name, FunctionType funcT, bool setPrivate,
-                           SymbolTableCollection *symbolTables = nullptr) {
-  OpBuilder::InsertionGuard g(b);
-  assert(!symTable->getRegion(0).empty() && "expected non-empty region");
-  b.setInsertionPointToStart(&symTable->getRegion(0).front());
-  FuncOp funcOp = FuncOp::create(b, symTable->getLoc(), name, funcT);
-  if (setPrivate)
-    funcOp.setPrivate();
-  if (symbolTables) {
-    SymbolTable &symbolTable = symbolTables->getSymbolTable(symTable);
-    symbolTable.insert(funcOp, symTable->getRegion(0).front().begin());
-  }
-  return funcOp;
-}
-
-/// Helper function to look up or create the symbol for a runtime library
-/// function for a binary arithmetic operation.
-///
-/// Parameter 1: APFloat semantics
-/// Parameter 2: Left-hand side operand
-/// Parameter 3: Right-hand side operand
-///
-/// This function will return a failure if the function is found but has an
-/// unexpected signature.
-///
-static FailureOr<FuncOp>
-lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name,
-                       SymbolTableCollection *symbolTables = nullptr) {
-  auto i32Type = IntegerType::get(symTable->getContext(), 32);
-  auto i64Type = IntegerType::get(symTable->getContext(), 64);
-
-  std::string funcName = (llvm::Twine("__mlir_apfloat_") + name).str();
-  FunctionType funcT =
-      FunctionType::get(b.getContext(), {i32Type, i64Type, i64Type}, {i64Type});
-  FailureOr<FuncOp> func =
-      lookupFnDecl(symTable, funcName, funcT, symbolTables);
-  // Failed due to type mismatch.
-  if (failed(func))
-    return func;
-  // Successfully matched existing decl.
-  if (*func)
-    return *func;
-
-  return createFnDecl(b, symTable, funcName, funcT,
-                      /*setPrivate=*/true, symbolTables);
-}
-
-/// Rewrite a binary arithmetic operation to an APFloat function call.
-template <typename OpTy, const char *APFloatName>
-struct BinaryArithOpToAPFloatConversion final : OpRewritePattern<OpTy> {
-  BinaryArithOpToAPFloatConversion(MLIRContext *context, PatternBenefit benefit,
-                                   SymbolOpInterface symTable)
-      : OpRewritePattern<OpTy>(context, benefit), symTable(symTable) {};
-
-  LogicalResult matchAndRewrite(OpTy op,
-                                PatternRewriter &rewriter) const override {
-    // Get APFloat function from runtime library.
-    FailureOr<FuncOp> fn =
-        lookupOrCreateBinaryFn(rewriter, symTable, APFloatName);
-    if (failed(fn))
-      return fn;
-
-    rewriter.setInsertionPoint(op);
-    // Cast operands to 64-bit integers.
-    Location loc = op.getLoc();
-    auto floatTy = cast<FloatType>(op.getType());
-    auto intWType = rewriter.getIntegerType(floatTy.getWidth());
-    auto int64Type = rewriter.getI64Type();
-    Value lhsBits = arith::ExtUIOp::create(
-        rewriter, loc, int64Type,
-        arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs()));
-    Value rhsBits = arith::ExtUIOp::create(
-        rewriter, loc, int64Type,
-        arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs()));
-
-    // Call APFloat function.
-    int32_t sem =
-        llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
-    Value semValue = arith::ConstantOp::create(
-        rewriter, loc, rewriter.getI32Type(),
-        rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
-    SmallVector<Value> params = {semValue, lhsBits, rhsBits};
-    auto resultOp =
-        func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()),
-                             SymbolRefAttr::get(*fn), params);
-
-    // Truncate result to the original width.
-    Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType,
-                                                  resultOp->getResult(0));
-    rewriter.replaceOp(
-        op, arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits));
-    return success();
-  }
-
-  SymbolOpInterface symTable;
-};
-
-namespace {
-struct ArithToAPFloatConversionPass final
-    : impl::ArithToAPFloatConversionPassBase<ArithToAPFloatConversionPass> {
-  using Base::Base;
-
-  void runOnOperation() override {
-    MLIRContext *context = &getContext();
-    RewritePatternSet patterns(context);
-    static const char add[] = "add";
-    static const char subtract[] = "subtract";
-    static const char multiply[] = "multiply";
-    static const char divide[] = "divide";
-    static const char remainder[] = "remainder";
-    patterns.add<BinaryArithOpToAPFloatConversion<arith::AddFOp, add>,
-                 BinaryArithOpToAPFloatConversion<arith::SubFOp, subtract>,
-                 BinaryArithOpToAPFloatConversion<arith::MulFOp, multiply>,
-                 BinaryArithOpToAPFloatConversion<arith::DivFOp, divide>,
-                 BinaryArithOpToAPFloatConversion<arith::RemFOp, remainder>>(
-        context, 1, getOperation());
-    LogicalResult result = success();
-    ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) {
-      if (diag.getSeverity() == DiagnosticSeverity::Error) {
-        result = failure();
-      }
-      // NB: if you don't return failure, no other diag handlers will fire (see
-      // mlir/lib/IR/Diagnostics.cpp:DiagnosticEngineImpl::emit).
-      return failure();
-    });
-    walkAndApplyPatterns(getOperation(), std::move(patterns));
-    if (failed(result))
-      return signalPassFailure();
-  }
-};
-} // namespace
diff --git a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
deleted file mode 100644
index b5ec49c087163..0000000000000
--- a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-add_mlir_conversion_library(MLIRArithToAPFloat
-  ArithToAPFloat.cpp
-
-  ADDITIONAL_HEADER_DIRS
-  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ArithToLLVM
-
-  DEPENDS
-  MLIRConversionPassIncGen
-
-  LINK_COMPONENTS
-  Core
-
-  LINK_LIBS PUBLIC
-  MLIRArithDialect
-  MLIRArithTransforms
-  MLIRFuncDialect
-  MLIRFuncUtils
-  )
diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
index f2bacc3399144..b6099902cc337 100644
--- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
+++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
@@ -14,7 +14,6 @@
 #include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
-#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/TypeUtilities.h"
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index 613dc6d242ceb..bebf1b8fff3f9 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -2,7 +2,6 @@ add_subdirectory(AffineToStandard)
 add_subdirectory(AMDGPUToROCDL)
 add_subdirectory(ArithCommon)
 add_subdirectory(ArithToAMDGPU)
-add_subdirectory(ArithToAPFloat)
 add_subdirectory(ArithToArmSME)
 add_subdirectory(ArithToEmitC)
 add_subdirectory(ArithToLLVM)
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index c747e1b59558a..69a317ecd101f 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1654,20 +1654,6 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern<vector::PrintOp> {
           return failure();
         }
       }
-    } else if (auto floatTy = dyn_cast<FloatType>(printType)) {
-      // Print other floating-point types using the APFloat runtime library.
-      int32_t sem =
-          llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
-      Value semValue = LLVM::ConstantOp::create(
-          rewriter, loc, rewriter.getI32Type(),
-          rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
-      Value floatBits =
-          LLVM::ZExtOp::create(rewriter, loc, rewriter.getI64Type(), value);
-      printer =
-          LLVM::lookupOrCreateApFloatPrintFn(rewriter, parent, symbolTables);
-      emitCall(rewriter, loc, printer.value(),
-               ValueRange({semValue, floatBits}));
-      return success();
     } else {
       return failure();
     }
diff --git a/mlir/lib/Dialect/Func/Utils/Utils.cpp b/mlir/lib/Dialect/Func/Utils/Utils.cpp
index d6dfd0229963c..b4cb0932ef631 100644
--- a/mlir/lib/Dialect/Func/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Func/Utils/Utils.cpp
@@ -254,28 +254,3 @@ func::deduplicateArgsOfFuncOp(RewriterBase &rewriter, func::FuncOp funcOp,
 
   return std::make_pair(*newFuncOpOrFailure, newCallOp);
 }
-
-FailureOr<func::FuncOp>
-func::lookupFnDecl(SymbolOpInterface symTable, StringRef name,
-                   FunctionType funcT, SymbolTableCollection *symbolTables) {
-  FuncOp func;
-  if (symbolTables) {
-    func = symbolTables->lookupSymbolIn<FuncOp>(
-        symTable, StringAttr::get(symTable->getContext(), name));
-  } else {
-    func = llvm::dyn_cast_or_null<FuncOp>(
-        SymbolTable::lookupSymbolIn(symTable, name));
-  }
-
-  if (!func)
-    return func;
-
-  mlir::FunctionType foundFuncT = func.getFunctionType();
-  // Assert the signature of the found function is same as expected
-  if (funcT != foundFuncT) {
-    return func.emitError("matched function '")
-           << name << "' but with different type: " << foundFuncT
-           << " (expected " << funcT << ")";
-  }
-  return func;
-}
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index 160b6ae89215c..feaffa34897b6 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -30,7 +30,6 @@ static constexpr llvm::StringRef kPrintF16 = "printF16";
 static constexpr llvm::StringRef kPrintBF16 = "printBF16";
 static constexpr llvm::StringRef kPrintF32 = "printF32";
 static constexpr llvm::StringRef kPrintF64 = "printF64";
-static constexpr llvm::StringRef kPrintApFloat = "printApFloat";
 static constexpr llvm::StringRef kPrintString = "printString";
 static constexpr llvm::StringRef kPrintOpen = "printOpen";
 static constexpr llvm::StringRef kPrintClose = "printClose";
@@ -161,16 +160,6 @@ mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
       LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
-FailureOr<LLVM::LLVMFuncOp>
-mlir::LLVM::lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
-                                         SymbolTableCollection *symbolTables) {
-  return lookupOrCreateReservedFn(
-      b, moduleOp, kPrintApFloat,
-      {IntegerType::get(moduleOp->getContext(), 32),
-       IntegerType::get(moduleOp->getContext(), 64)},
-      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
-}
-
 static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) {
   return LLVM::LLVMPointerType::get(context);
 }
diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
deleted file mode 100644
index 85ea0986cde5b..0000000000000
--- a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-//===- APFloatWrappers.cpp - Software Implementation of FP Arithmetics --- ===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file exposes the APFloat infrastructure to MLIR programs as a runtime
-// library. APFloat is a software implementation of floating point arithmetics.
-//
-// On the MLIR side, floating-point values must be bitcasted to 64-bit integers
-// before calling a runtime function. If a floating-point type has less than
-// 64 bits, it must be zero-extended to 64 bits after bitcasting it to an
-// integer.
-//
-// Runtime functions receive the floating-point operands of the arithmeic
-// operation in the form of 64-bit integers, along with the APFloat semantics
-// in the form of a 32-bit integer, which will be interpreted as an
-// APFloatBase::Semantics enum value.
-//
-#include "llvm/ADT/APFloat.h"
-
-#if (defined(_WIN32) || defined(__CYGWIN__))
-#define MLIR_APFLOAT_WRAPPERS_EXPORTED __declspec(dllexport)
-#else
-#define MLIR_APFLOAT_WRAPPERS_EXPORTED __attribute__((visibility("default")))
-#endif
-
-/// Binary operations without rounding mode.
-#define APFLOAT_BINARY_OP(OP)                                                  \
-  int64_t MLIR_APFLOAT_WRAPPERS_EXPORTED __mlir_apfloat_##OP(                  \
-      int32_t semantics, uint64_t a, uint64_t b) {                             \
-    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
-        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
-    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
-    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
-    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
-    lhs.OP(rhs);                                                               \
-    return lhs.bitcastToAPInt().getZExtValue();                                \
-  }
-
-/// Binary operations with rounding mode.
-#define APFLOAT_BINARY_OP_ROUNDING_MODE(OP, ROUNDING_MODE)                     \
-  int64_t MLIR_APFLOAT_WRAPPERS_EXPORTED __mlir_apfloat_##OP(                  \
-      int32_t semantics, uint64_t a, uint64_t b) {                             \
-    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
-        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
-    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
-    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
-    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
-    lhs.OP(rhs, ROUNDING_MODE);                                                \
-    return lhs.bitcastToAPInt().getZExtValue();                                \
-  }
-
-extern "C" {
-
-#define BIN_OPS_WITH_ROUNDING(X)                                               \
-  X(add, llvm::RoundingMode::NearestTiesToEven)                                \
-  X(subtract, llvm::RoundingMode::NearestTiesToEven)                           \
-  X(multiply, llvm::RoundingMode::NearestTiesToEven)                           \
-  X(divide, llvm::RoundingMode::NearestTiesToEven)
-
-BIN_OPS_WITH_ROUNDING(APFLOAT_BINARY_OP_ROUNDING_MODE)
-#undef BIN_OPS_WITH_ROUNDING
-#undef APFLOAT_BINARY_OP_ROUNDING_MODE
-
-APFLOAT_BINARY_OP(remainder)
-
-#undef APFLOAT_BINARY_OP
-
-void MLIR_APFLOAT_WRAPPERS_EXPORTED printApFloat(int32_t semantics,
-                                                 uint64_t a) {
-  const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(
-      static_cast<llvm::APFloatBase::Semantics>(semantics));
-  unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);
-  llvm::APFloat x(sem, llvm::APInt(bitWidth, a));
-  double d = x.convertToDouble();
-  fprintf(stdout, "%lg", d);
-}
-}
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index 8c09e50e4de7b..fdeb4dacf9278 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -2,7 +2,6 @@
 # is a big dependency which most don't need.
 
 set(LLVM_OPTIONAL_SOURCES
-  APFloatWrappers.cpp
   ArmRunnerUtils.cpp
   ArmSMEStubs.cpp
   AsyncRuntime.cpp
@@ -168,15 +167,6 @@ if(LLVM_ENABLE_PIC)
   set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS)
 
-  add_mlir_library(mlir_apfloat_wrappers
-    SHARED
-    APFloatWrappers.cpp
-
-    EXCLUDE_FROM_LIBMLIR
-    )
-  set_property(TARGET mlir_apfloat_wrappers PROPERTY CXX_STANDARD 17)
-  target_compile_definitions(mlir_apfloat_wrappers PRIVATE mlir_apfloat_wrappers_EXPORTS)
-
   add_subdirectory(SparseTensor)
 
   add_mlir_library(mlir_c_runner_utils
@@ -187,7 +177,6 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
-    mlir_apfloat_wrappers
     mlir_float16_utils
     MLIRSparseTensorEnums
     MLIRSparseTensorRuntime
@@ -202,7 +191,6 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
-    mlir_apfloat_wrappers
     mlir_float16_utils
   )
   target_compile_definitions(mlir_runner_utils PRIVATE mlir_runner_utils_EXPORTS)
diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
deleted file mode 100644
index fe4d28a56f808..0000000000000
--- a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
+++ /dev/null
@@ -1,128 +0,0 @@
-// RUN: mlir-opt %s --convert-arith-to-apfloat -split-input-file -verify-diagnostics | FileCheck %s
-
-// CHECK-LABEL:   func.func private @__mlir_apfloat_add(i32, i64, i64) -> i64
-
-// CHECK-LABEL:   func.func @foo() -> f8E4M3FN {
-// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 2.250000e+00 : f8E4M3FN
-// CHECK:           return %[[CONSTANT_0]] : f8E4M3FN
-// CHECK:         }
-
-// CHECK-LABEL:   func.func @bar() -> f6E3M2FN {
-// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 3.000000e+00 : f6E3M2FN
-// CHECK:           return %[[CONSTANT_0]] : f6E3M2FN
-// CHECK:         }
-
-// Illustrate that both f8E4M3FN and f6E3M2FN calling the same __mlir_apfloat_add is fine
-// because each gets its own semantics enum and gets bitcast/extui/trunci to its own width.
-// CHECK-LABEL:   func.func @full_example() {
-// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 1.375000e+00 : f8E4M3FN
-// CHECK:           %[[VAL_0:.*]] = call @foo() : () -> f8E4M3FN
-// CHECK:           %[[BITCAST_0:.*]] = arith.bitcast %[[CONSTANT_0]] : f8E4M3FN to i8
-// CHECK:           %[[EXTUI_0:.*]] = arith.extui %[[BITCAST_0]] : i8 to i64
-// CHECK:           %[[BITCAST_1:.*]] = arith.bitcast %[[VAL_0]] : f8E4M3FN to i8
-// CHECK:           %[[EXTUI_1:.*]] = arith.extui %[[BITCAST_1]] : i8 to i64
-//                  // fltSemantics semantics for f8E4M3FN
-// CHECK:           %[[CONSTANT_1:.*]] = arith.constant 10 : i32
-// CHECK:           %[[VAL_1:.*]] = call @__mlir_apfloat_add(%[[CONSTANT_1]], %[[EXTUI_0]], %[[EXTUI_1]]) : (i32, i64, i64) -> i64
-// CHECK:           %[[TRUNCI_0:.*]] = arith.trunci %[[VAL_1]] : i64 to i8
-// CHECK:           %[[BITCAST_2:.*]] = arith.bitcast %[[TRUNCI_0]] : i8 to f8E4M3FN
-// CHECK:           vector.print %[[BITCAST_2]] : f8E4M3FN
-
-// CHECK:           %[[CONSTANT_2:.*]] = arith.constant 2.500000e+00 : f6E3M2FN
-// CHECK:           %[[VAL_2:.*]] = call @bar() : () -> f6E3M2FN
-// CHECK:           %[[BITCAST_3:.*]] = arith.bitcast %[[CONSTANT_2]] : f6E3M2FN to i6
-// CHECK:           %[[EXTUI_2:.*]] = arith.extui %[[BITCAST_3]] : i6 to i64
-// CHECK:           %[[BITCAST_4:.*]] = arith.bitcast %[[VAL_2]] : f6E3M2FN to i6
-// CHECK:           %[[EXTUI_3:.*]] = arith.extui %[[BITCAST_4]] : i6 to i64
-//                  // fltSemantics semantics for f6E3M2FN
-// CHECK:           %[[CONSTANT_3:.*]] = arith.constant 16 : i32
-// CHECK:           %[[VAL_3:.*]] = call @__mlir_apfloat_add(%[[CONSTANT_3]], %[[EXTUI_2]], %[[EXTUI_3]]) : (i32, i64, i64) -> i64
-// CHECK:           %[[TRUNCI_1:.*]] = arith.trunci %[[VAL_3]] : i64 to i6
-// CHECK:           %[[BITCAST_5:.*]] = arith.bitcast %[[TRUNCI_1]] : i6 to f6E3M2FN
-// CHECK:           vector.print %[[BITCAST_5]] : f6E3M2FN
-// CHECK:           return
-// CHECK:         }
-
-// Put rhs into separate function so that it won't be constant-folded.
-func.func @foo() -> f8E4M3FN {
-  %cst = arith.constant 2.2 : f8E4M3FN
-  return %cst : f8E4M3FN
-}
-
-func.func @bar() -> f6E3M2FN {
-  %cst = arith.constant 3.2 : f6E3M2FN
-  return %cst : f6E3M2FN
-}
-
-func.func @full_example() {
-  %a = arith.constant 1.4 : f8E4M3FN
-  %b = func.call @foo() : () -> (f8E4M3FN)
-  %c = arith.addf %a, %b : f8E4M3FN
-  vector.print %c : f8E4M3FN
-
-  %d = arith.constant 2.4 : f6E3M2FN
-  %e = func.call @bar() : () -> (f6E3M2FN)
-  %f = arith.addf %d, %e : f6E3M2FN
-  vector.print %f : f6E3M2FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @__mlir_apfloat_add(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @__mlir_apfloat_add(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// Test decl collision (different type)
-// expected-error@+1{{matched function '__mlir_apfloat_add' but with different type: '(i32, i32, f32) -> index' (expected '(i32, i64, i64) -> i64')}}
-func.func private @__mlir_apfloat_add(i32, i32, f32) -> index
-func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @__mlir_apfloat_subtract(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @__mlir_apfloat_subtract(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.subf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @__mlir_apfloat_multiply(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @__mlir_apfloat_multiply(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.mulf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @__mlir_apfloat_divide(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @__mlir_apfloat_divide(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.divf %arg0, %arg1 : f4E2M1FN
-  return
-}
-
-// -----
-
-// CHECK: func.func private @__mlir_apfloat_remainder(i32, i64, i64) -> i64
-// CHECK: %[[sem:.*]] = arith.constant 18 : i32
-// CHECK: call @__mlir_apfloat_remainder(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
-func.func @remf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
-  %0 = arith.remf %arg0, %arg1 : f4E2M1FN
-  return
-}
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
deleted file mode 100644
index a2b3eb73a60b8..0000000000000
--- a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
+++ /dev/null
@@ -1,34 +0,0 @@
-// Case 1: All floating-point arithmetics is lowered through APFloat.
-// RUN: mlir-opt %s --convert-arith-to-apfloat --convert-to-llvm | \
-// RUN: mlir-runner -e entry --entry-point-result=void \
-// RUN:             --shared-libs=%mlir_c_runner_utils | FileCheck %s
-
-// Case 2: Only unsupported arithmetics (f8E4M3FN) is lowered through APFloat.
-//         Arithmetics on f32 is lowered directly to LLVM.
-// RUN: mlir-opt %s --convert-to-llvm --convert-arith-to-apfloat \
-// RUN:          --convert-to-llvm --reconcile-unrealized-casts | \
-// RUN: mlir-runner -e entry --entry-point-result=void \
-// RUN:             --shared-libs=%mlir_c_runner_utils | FileCheck %s
-
-// Put rhs into separate function so that it won't be constant-folded.
-func.func @foo() -> (f8E4M3FN, f32) {
-  %cst1 = arith.constant 2.2 : f8E4M3FN
-  %cst2 = arith.constant 2.2 : f32
-  return %cst1, %cst2 : f8E4M3FN, f32
-}
-
-func.func @entry() {
-  %a1 = arith.constant 1.4 : f8E4M3FN
-  %a2 = arith.constant 1.4 : f32
-  %b1, %b2 = func.call @foo() : () -> (f8E4M3FN, f32)
-  %c1 = arith.addf %a1, %b1 : f8E4M3FN  // not supported by LLVM
-  %c2 = arith.addf %a2, %b2 : f32       // supported by LLVM
-
-  // CHECK: 3.5
-  vector.print %c1 : f8E4M3FN
-
-  // CHECK: 3.6
-  vector.print %c2 : f32
-
-  return
-}

From 7e043364cf1f267380b2be955901b28ede9ca8eb Mon Sep 17 00:00:00 2001
From: Petr Hosek <phosek@google.com>
Date: Mon, 10 Nov 2025 17:56:03 -0800
Subject: [PATCH 48/97] Revert "[DFAJumpThreading] Enable DFAJumpThread by
 default." (#167352)

Reverts llvm/llvm-project#157646, DFAJumpThread is causing miscompiles
when building Clang with PGO, see llvm/llvm-project#166868 for details.
---
 llvm/lib/Passes/PassBuilderPipelines.cpp                      | 2 +-
 llvm/test/Other/new-pm-defaults.ll                            | 1 -
 llvm/test/Other/new-pm-thinlto-postlink-defaults.ll           | 1 -
 llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll       | 1 -
 llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll | 1 -
 llvm/test/Other/new-pm-thinlto-prelink-defaults.ll            | 1 -
 llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll        | 1 -
 llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll  | 1 -
 8 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 2fe963b3b68d9..dd73c04959732 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -228,7 +228,7 @@ static cl::opt<bool> EnableLoopHeaderDuplication(
 static cl::opt<bool>
     EnableDFAJumpThreading("enable-dfa-jump-thread",
                            cl::desc("Enable DFA jump threading"),
-                           cl::init(true), cl::Hidden);
+                           cl::init(false), cl::Hidden);
 
 static cl::opt<bool>
     EnableHotColdSplit("hot-cold-split",
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index b59d4cf6af998..1f437a662cc96 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -208,7 +208,6 @@
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
-; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index c1d8b42505c84..2d8b8f1b22091 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -133,7 +133,6 @@
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
-; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 45f090252eaf7..7cacc17c7ab9a 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -118,7 +118,6 @@
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
-; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index 4c330f44d30cc..ef6cd8354ae3d 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -127,7 +127,6 @@
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
-; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index b61edc805108d..dd6acd2c51ee7 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -165,7 +165,6 @@
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
-; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index acf8c053d0f82..ee054527e20bd 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -167,7 +167,6 @@
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
-; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 6b3c5ca7605de..fd95e94f3c8b9 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -131,7 +131,6 @@
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
-; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass

From 90b4b86bccd31de8c018a9aa560b8415c60febf8 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Mon, 10 Nov 2025 17:58:24 -0800
Subject: [PATCH 49/97] [NFC][Support] Replace "Return" with "Returns" in
 comments (#167395)

Fixes typos in comments in `SpecialCaseList.h`.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 llvm/include/llvm/Support/SpecialCaseList.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index 35411a1dea06b..2e5cbb54a49a1 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -203,13 +203,13 @@ class SpecialCaseList {
 
     Section(Section &&) = default;
 
-    // Return name of the section, its entire string in [].
+    // Returns name of the section, its entire string in [].
     StringRef name() const { return SectionStr; }
 
     // Returns true if string 'Name' matches section name interpreted as a glob.
     LLVM_ABI bool matchName(StringRef Name) const;
 
-    // Return sequence number of the file where this section is defined.
+    // Returns sequence number of the file where this section is defined.
     unsigned fileIndex() const { return FileIdx; }
 
     // Helper method to search by Prefix, Query, and Category. Returns

From b7ceda3bc9931b86f296a29bed2f252c47389081 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 18:04:52 -0800
Subject: [PATCH 50/97] MachineCombiner: Partially fix losing subregister
 indexes (#141661)

This fixes verifier errors in this test after earlier passes start
introducing more subregister uses. This probably isn't adequately
tested but I know nothing about this pass.
---
 llvm/lib/CodeGen/TargetInstrInfo.cpp          | 15 +++++--
 .../AArch64/machine-combiner-subregs.mir      | 35 +++++++++++++++++
 ...machine-combiner-subreg-verifier-error.mir | 39 +++++++++++++++++++
 3 files changed, 85 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir

diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 76f99848a823c..d503d7a2345fd 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1330,9 +1330,12 @@ void TargetInstrInfo::reassociateOps(
   MachineOperand &OpC = Root.getOperand(0);
 
   Register RegA = OpA.getReg();
+  unsigned SubRegA = OpA.getSubReg();
   Register RegB = OpB.getReg();
   Register RegX = OpX.getReg();
+  unsigned SubRegX = OpX.getSubReg();
   Register RegY = OpY.getReg();
+  unsigned SubRegY = OpY.getSubReg();
   Register RegC = OpC.getReg();
 
   if (RegA.isVirtual())
@@ -1350,6 +1353,7 @@ void TargetInstrInfo::reassociateOps(
   // recycling RegB because the MachineCombiner's computation of the critical
   // path requires a new register definition rather than an existing one.
   Register NewVR = MRI.createVirtualRegister(RC);
+  unsigned SubRegNewVR = 0;
   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
 
   auto [NewRootOpc, NewPrevOpc] = getReassociationOpcodes(Pattern, Root, Prev);
@@ -1362,6 +1366,7 @@ void TargetInstrInfo::reassociateOps(
 
   if (SwapPrevOperands) {
     std::swap(RegX, RegY);
+    std::swap(SubRegX, SubRegY);
     std::swap(KillX, KillY);
   }
 
@@ -1414,9 +1419,9 @@ void TargetInstrInfo::reassociateOps(
     if (Idx == 0)
       continue;
     if (Idx == PrevFirstOpIdx)
-      MIB1.addReg(RegX, getKillRegState(KillX));
+      MIB1.addReg(RegX, getKillRegState(KillX), SubRegX);
     else if (Idx == PrevSecondOpIdx)
-      MIB1.addReg(RegY, getKillRegState(KillY));
+      MIB1.addReg(RegY, getKillRegState(KillY), SubRegY);
     else
       MIB1.add(MO);
   }
@@ -1424,6 +1429,7 @@ void TargetInstrInfo::reassociateOps(
 
   if (SwapRootOperands) {
     std::swap(RegA, NewVR);
+    std::swap(SubRegA, SubRegNewVR);
     std::swap(KillA, KillNewVR);
   }
 
@@ -1435,9 +1441,9 @@ void TargetInstrInfo::reassociateOps(
     if (Idx == 0)
       continue;
     if (Idx == RootFirstOpIdx)
-      MIB2 = MIB2.addReg(RegA, getKillRegState(KillA));
+      MIB2 = MIB2.addReg(RegA, getKillRegState(KillA), SubRegA);
     else if (Idx == RootSecondOpIdx)
-      MIB2 = MIB2.addReg(NewVR, getKillRegState(KillNewVR));
+      MIB2 = MIB2.addReg(NewVR, getKillRegState(KillNewVR), SubRegNewVR);
     else
       MIB2 = MIB2.add(MO);
   }
@@ -1525,6 +1531,7 @@ void TargetInstrInfo::genAlternativeCodeSequence(
       if (IndexedReg.index() == 0)
         continue;
 
+      // FIXME: Losing subregisters
       MachineInstr *Instr = MRI.getUniqueVRegDef(IndexedReg.value());
       MachineInstrBuilder MIB;
       Register AccReg;
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir b/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir
new file mode 100644
index 0000000000000..c96a0385c3a4e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir
@@ -0,0 +1,35 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 -run-pass=machine-combiner -o - %s | FileCheck %s
+
+# Make sure machine combiner doesn't drop subregister indexes.
+
+---
+name:            reassociate_adds2_reassoc
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $q1, $q2, $q3
+
+    ; CHECK-LABEL: name: reassociate_adds2_reassoc
+    ; CHECK: liveins: $q0, $q1, $q2, $q3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr128 = COPY $q3
+    ; CHECK-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr [[COPY]].ssub, [[COPY1]].ssub, implicit $fpcr
+    ; CHECK-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr [[COPY2]].ssub, [[COPY3]].ssub, implicit $fpcr
+    ; CHECK-NEXT: [[FADDSrr2:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr killed [[FADDSrr1]], killed [[FADDSrr]], implicit $fpcr
+    ; CHECK-NEXT: $s0 = COPY [[FADDSrr2]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $s0
+    %0:fpr128 = COPY $q0
+    %1:fpr128 = COPY $q1
+    %2:fpr128 = COPY $q2
+    %3:fpr128 = COPY $q3
+    %4:fpr32 = nsz reassoc nofpexcept FADDSrr %0.ssub, %1.ssub, implicit $fpcr
+    %5:fpr32 = nsz reassoc nofpexcept FADDSrr %2.ssub, killed %4, implicit $fpcr
+    %6:fpr32 = nsz reassoc nofpexcept FADDSrr killed %5, %3.ssub, implicit $fpcr
+    $s0 = COPY %6
+    RET_ReallyLR implicit $s0
+
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir b/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir
new file mode 100644
index 0000000000000..76dfd4e746bea
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir
@@ -0,0 +1,39 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs -run-pass=machine-combiner -o - %s | FileCheck %s
+
+# Make sure the verifier doesn't fail due to dropping subregister
+# uses.
+
+---
+name:            machine_combiner_subreg_verifier_error
+tracksRegLiveness: true
+isSSA:           true
+body:             |
+  bb.0:
+    liveins: $v8m4, $v12m4
+
+    ; CHECK-LABEL: name: machine_combiner_subreg_verifier_error
+    ; CHECK: liveins: $v8m4, $v12m4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:gprnox0 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vr = IMPLICIT_DEF
+    ; CHECK-NEXT: [[PseudoVSLIDEDOWN_VI_M8_:%[0-9]+]]:vrm8 = PseudoVSLIDEDOWN_VI_M8 $noreg, [[DEF2]], 26, 2, 5 /* e32 */, 3 /* ta, ma */
+    ; CHECK-NEXT: [[PseudoVADD_VV_MF2_:%[0-9]+]]:vr = PseudoVADD_VV_MF2 $noreg, [[DEF2]].sub_vrm1_0, killed [[DEF3]], 2, 5 /* e32 */, 1 /* ta, mu */
+    ; CHECK-NEXT: [[PseudoVADD_VV_MF2_1:%[0-9]+]]:vr = PseudoVADD_VV_MF2 $noreg, [[PseudoVSLIDEDOWN_VI_M8_]].sub_vrm1_0, killed [[PseudoVADD_VV_MF2_]], 2, 5 /* e32 */, 1 /* ta, mu */
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %0:vrm4 = IMPLICIT_DEF
+    %1:gprnox0 = IMPLICIT_DEF
+    %2:vrm8 = IMPLICIT_DEF
+    %3:vr = IMPLICIT_DEF
+    %4:vrm2 = IMPLICIT_DEF
+    %5:vr = IMPLICIT_DEF
+    %6:vrm8 = PseudoVSLIDEDOWN_VI_M8 $noreg, %2, 26, 2, 5 /* e32 */, 3 /* ta, ma */
+    %7:vr = PseudoVADD_VV_MF2 $noreg, %6.sub_vrm1_0, %2.sub_vrm1_0, 2, 5 /* e32 */, 1 /* ta, mu */
+    %8:vr = PseudoVADD_VV_MF2 $noreg, killed %7, killed %3, 2, 5 /* e32 */, 1 /* ta, mu */
+    PseudoRET implicit $v8
+
+...

From 86fa018a1dca99a1a199e1a0e6f5730546198824 Mon Sep 17 00:00:00 2001
From: agozillon <Andrew.Gozillon@amd.com>
Date: Tue, 11 Nov 2025 03:15:58 +0100
Subject: [PATCH 51/97] [Flang][OpenMP] Initial defaultmap(none) implementation
 (#166715)

This PR adds defaultmap(none) behaviour to Flang, where we emit a
semantic error if variables within the target construct do not have an
associated data attribute. Similar to the way default behaves, as
described by the OpenMP specification.
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             |  7 +-
 flang/lib/Semantics/resolve-directives.cpp    | 91 ++++++++++++++++++
 .../Todo/defaultmap-clause-firstprivate.f90   |  2 +-
 .../OpenMP/Todo/defaultmap-clause-none.f90    | 11 ---
 .../OpenMP/defaultmap-clause-none.f90         | 96 +++++++++++++++++++
 5 files changed, 191 insertions(+), 16 deletions(-)
 delete mode 100644 flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90
 create mode 100644 flang/test/Semantics/OpenMP/defaultmap-clause-none.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index ad456d89bc432..4429a9a94ed3c 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1008,9 +1008,7 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder,
                           mlir::omp::VariableCaptureKind::ByRef);
     break;
   case DefMap::ImplicitBehavior::Firstprivate:
-  case DefMap::ImplicitBehavior::None:
-    TODO(loc, "Firstprivate and None are currently unsupported defaultmap "
-              "behaviour");
+    TODO(loc, "Firstprivate is currently unsupported defaultmap behaviour");
     break;
   case DefMap::ImplicitBehavior::From:
     return std::make_pair(mapFlag |= mlir::omp::ClauseMapFlags::from,
@@ -1032,8 +1030,9 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder,
                           mlir::omp::VariableCaptureKind::ByRef);
     break;
   case DefMap::ImplicitBehavior::Default:
+  case DefMap::ImplicitBehavior::None:
     llvm_unreachable(
-        "Implicit None Behaviour Should Have Been Handled Earlier");
+        "Implicit None and Default behaviour should have been handled earlier");
     break;
   }
 
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index deb57e005a352..224c69163b85e 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -2965,6 +2965,67 @@ void OmpAttributeVisitor::CreateImplicitSymbols(const Symbol *symbol) {
   }
 }
 
+static bool IsOpenMPPointer(const Symbol &symbol) {
+  if (IsPointer(symbol) || IsBuiltinCPtr(symbol))
+    return true;
+  return false;
+}
+
+static bool IsOpenMPAggregate(const Symbol &symbol) {
+  if (IsAllocatable(symbol) || IsOpenMPPointer(symbol))
+    return false;
+
+  const auto *type{symbol.GetType()};
+  // OpenMP categorizes Fortran characters as aggregates.
+  if (type->category() == Fortran::semantics::DeclTypeSpec::Category::Character)
+    return true;
+
+  if (const auto *det{symbol.GetUltimate()
+              .detailsIf<Fortran::semantics::ObjectEntityDetails>()})
+    if (det->IsArray())
+      return true;
+
+  if (type->AsDerived())
+    return true;
+
+  if (IsDeferredShape(symbol) || IsAssumedRank(symbol) ||
+      IsAssumedShape(symbol))
+    return true;
+  return false;
+}
+
+static bool IsOpenMPScalar(const Symbol &symbol) {
+  if (IsOpenMPAggregate(symbol) || IsOpenMPPointer(symbol) ||
+      IsAllocatable(symbol))
+    return false;
+  const auto *type{symbol.GetType()};
+  if ((!symbol.GetShape() || symbol.GetShape()->empty()) &&
+      (type->category() ==
+              Fortran::semantics::DeclTypeSpec::Category::Numeric ||
+          type->category() ==
+              Fortran::semantics::DeclTypeSpec::Category::Logical))
+    return true;
+  return false;
+}
+
+static bool DefaultMapCategoryMatchesSymbol(
+    parser::OmpVariableCategory::Value category, const Symbol &symbol) {
+  using VarCat = parser::OmpVariableCategory::Value;
+  switch (category) {
+  case VarCat::Scalar:
+    return IsOpenMPScalar(symbol);
+  case VarCat::Allocatable:
+    return IsAllocatable(symbol);
+  case VarCat::Aggregate:
+    return IsOpenMPAggregate(symbol);
+  case VarCat::Pointer:
+    return IsOpenMPPointer(symbol);
+  case VarCat::All:
+    return true;
+  }
+  return false;
+}
+
 // For OpenMP constructs, check all the data-refs within the constructs
 // and adjust the symbol for each Name if necessary
 void OmpAttributeVisitor::Post(const parser::Name &name) {
@@ -3000,6 +3061,36 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
       }
     }
 
+    // TODO: handle case where default and defaultmap are present on the same
+    // construct and conflict, defaultmap should supersede default if they
+    // conflict.
+    if (!GetContext().defaultMap.empty()) {
+      // Checked before implicit data sharing attributes as this rule ignores
+      // them and expects explicit predetermined/specified attributes to be in
+      // place for the types specified.
+      if (Symbol * found{currScope().FindSymbol(name.source)}) {
+        // If the variable has declare target applied to it (enter or link) it
+        // is exempt from defaultmap(none) restrictions
+        if (!symbol->GetUltimate().test(Symbol::Flag::OmpDeclareTarget)) {
+          auto &dMap = GetContext().defaultMap;
+          for (auto defaults : dMap) {
+            if (defaults.second ==
+                parser::OmpDefaultmapClause::ImplicitBehavior::None) {
+              if (DefaultMapCategoryMatchesSymbol(defaults.first, *found)) {
+                if (!IsObjectWithDSA(*symbol)) {
+                  context_.Say(name.source,
+                      "The DEFAULTMAP(NONE) clause requires that '%s' must be "
+                      "listed in a "
+                      "data-sharing attribute, data-mapping attribute, or is_device_ptr clause"_err_en_US,
+                      symbol->name());
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
     if (Symbol * found{currScope().FindSymbol(name.source)}) {
       if (found->GetUltimate().test(semantics::Symbol::Flag::OmpThreadprivate))
         return;
diff --git a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90
index 6818c39f63a3c..1e0d9694258cc 100644
--- a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90
+++ b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90
@@ -6,7 +6,7 @@ subroutine f00
     ! NOTE: This is implemented for scalars as it is the default behaviour, so we utilise
     ! a different data type.
     integer, allocatable :: i
-    !CHECK: not yet implemented: Firstprivate and None are currently unsupported defaultmap behaviour
+    !CHECK: not yet implemented: Firstprivate is currently unsupported defaultmap behaviour
     !$omp target defaultmap(firstprivate)
       i = 10
     !$omp end target
diff --git a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90 b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90
deleted file mode 100644
index 287eb4a9dfe8f..0000000000000
--- a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90
+++ /dev/null
@@ -1,11 +0,0 @@
-!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
-!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
-
-subroutine f00
-  implicit none
-  integer :: i
-  !CHECK: not yet implemented: Firstprivate and None are currently unsupported defaultmap behaviour
-  !$omp target defaultmap(none)
-    i = 10
-  !$omp end target
-end
diff --git a/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90 b/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90
new file mode 100644
index 0000000000000..08e8ebc995097
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90
@@ -0,0 +1,96 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=51
+
+subroutine defaultmap_all_none_no_errors
+    implicit none
+    real :: array(10)
+    integer,  pointer :: ptr(:)
+    real, allocatable :: alloca
+    integer :: index
+
+    !$omp target defaultmap(none) map(to: index, alloca) map(tofrom: array, ptr)
+        do index = 1, 10
+            ptr(index) = array(index) + alloca
+        end do
+    !$omp end target
+end subroutine defaultmap_all_none_no_errors
+
+subroutine defaultmap_all_none
+    implicit none
+    real :: array(10)
+    integer,  pointer :: ptr(:)
+    real, allocatable :: alloca
+    integer :: index
+    !$omp target defaultmap(none)
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+        do index = 1, 10
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'ptr' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'array' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'alloca' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+            ptr(index) = array(index) + alloca
+        end do
+    !$omp end target
+end subroutine defaultmap_all_none
+
+subroutine defaultmap_scalar_none
+    implicit none
+    real :: array(10)
+    integer,  pointer :: ptr(:)
+    real, allocatable :: alloca
+    integer :: index
+
+    !$omp target defaultmap(none: scalar)
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+        do index = 1, 10
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+            ptr(index) = array(index) + alloca
+        end do
+    !$omp end target
+end subroutine defaultmap_scalar_none
+
+subroutine defaultmap_pointer_none
+    implicit none
+    real :: array(10)
+    integer,  pointer :: ptr(:)
+    real, allocatable :: alloca
+    integer :: index
+
+    !$omp target defaultmap(none: pointer)
+        do index = 1, 10
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'ptr' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+            ptr(index) = array(index) + alloca
+        end do
+    !$omp end target
+end subroutine defaultmap_pointer_none
+
+subroutine defaultmap_allocatable_none
+    implicit none
+    real :: array(10)
+    integer,  pointer :: ptr(:)
+    real, allocatable :: alloca
+    integer :: index
+
+    !$omp target defaultmap(none: allocatable)
+        do index = 1, 10
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'alloca' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+            ptr(index) = array(index) + alloca
+        end do
+    !$omp end target
+end subroutine defaultmap_allocatable_none
+
+subroutine defaultmap_aggregate_none
+    implicit none
+    real :: array(10)
+    integer,  pointer :: ptr(:)
+    real, allocatable :: alloca
+    integer :: index
+
+    !$omp target defaultmap(none: aggregate)
+        do index = 1, 10
+!ERROR: The DEFAULTMAP(NONE) clause requires that 'array' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause
+            ptr(index) = array(index) + alloca
+        end do
+    !$omp end target
+end subroutine defaultmap_aggregate_none

From cdc6f858e8a8892739632911b78bf379551233f2 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Mon, 10 Nov 2025 18:19:29 -0800
Subject: [PATCH 52/97] [Github] Add Workflow to test Unprivileged Download
 Artifact Action

Since this is a composite action, we do not get any testing currently
when updating the action. This patch adds a simple workflow to test the
action so we can ensure we do not break it when modifying it.

Reviewers: tstellar

Reviewed By: tstellar

Pull Request: https://github.com/llvm/llvm-project/pull/167434
---
 .../test-unprivileged-download-artifact.yml   | 54 +++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 .github/workflows/test-unprivileged-download-artifact.yml

diff --git a/.github/workflows/test-unprivileged-download-artifact.yml b/.github/workflows/test-unprivileged-download-artifact.yml
new file mode 100644
index 0000000000000..a9c0912b0f44e
--- /dev/null
+++ b/.github/workflows/test-unprivileged-download-artifact.yml
@@ -0,0 +1,54 @@
+name: Test Unprivileged Download Artifact Action
+
+permissions:
+  contents: read
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - .github/workflows/test-unprivileged-download-artifact.yml
+      - '.github/workflows/unprivileged-download-artifact/**'
+  pull_request:
+    paths:
+      - .github/workflows/test-unprivileged-download-artifact.yml
+      - '.github/workflows/unprivileged-download-artifact/**'
+
+jobs:
+  upload-test-artifact:
+    name: Upload Test Artifact
+    if: github.repository_owner == 'llvm'
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Create Test File
+        run: |
+          echo "test" > comment
+      - name: Upload Test File
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
+        with:
+          name: workflow-args
+          path: |
+            comment
+
+  test-download:
+    name: Test Unprivileged Download Artifact
+    if: github.repository_owner == 'llvm'
+    runs-on: ubuntu-24.04
+    needs: [ upload-test-artifact ]
+    steps:
+      - name: Chekcout LLVM
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          sparse-checkout: |
+            .github/workflows/unprivileged-download-artifact/action.yml
+      - name: Download Artifact
+        uses: ./.github/workflows/unprivileged-download-artifact
+        id: download-artifact
+        with:
+          run-id: ${{ github.run_id }}
+          artifact-name: workflow-args
+      - name: Assert That Contents are the Same
+        run: |
+          cat comment
+          [[ "$(cat comment)" == "test" ]]

From ae1e2ec8a2c6cb48636e737a737ae225005e8e49 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Mon, 10 Nov 2025 18:27:03 -0800
Subject: [PATCH 53/97] [NFC][SpecialCaseList] Rename `Section::SectionStr` to
 `Name` (#167279)

---
 llvm/include/llvm/Support/SpecialCaseList.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index 2e5cbb54a49a1..e92aeef2bfaae 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -197,14 +197,14 @@ class SpecialCaseList {
 protected:
   class Section {
   public:
-    Section(StringRef Str, unsigned FileIdx, bool UseGlobs)
-        : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false), SectionStr(Str),
+    Section(StringRef Name, unsigned FileIdx, bool UseGlobs)
+        : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false), Name(Name),
           FileIdx(FileIdx) {}
 
     Section(Section &&) = default;
 
     // Returns name of the section, its entire string in [].
-    StringRef name() const { return SectionStr; }
+    StringRef name() const { return Name; }
 
     // Returns true if string 'Name' matches section name interpreted as a glob.
     LLVM_ABI bool matchName(StringRef Name) const;
@@ -232,7 +232,7 @@ class SpecialCaseList {
     findMatcher(StringRef Prefix, StringRef Category) const;
 
     Matcher SectionMatcher;
-    StringRef SectionStr;
+    StringRef Name;
     SectionEntries Entries;
     unsigned FileIdx;
   };

From bdf37f49f904b695214aec1332f7f248ed7bd873 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 18:48:39 -0800
Subject: [PATCH 54/97] unittests: Convert some tests to opaque pointers
 (#167443)

---
 llvm/unittests/Analysis/AliasAnalysisTest.cpp |  50 ++---
 .../Analysis/AliasSetTrackerTest.cpp          |   6 +-
 .../Analysis/AssumeBundleQueriesTest.cpp      |  90 ++++----
 .../Analysis/CGSCCPassManagerTest.cpp         |  16 +-
 .../Analysis/CaptureTrackingTest.cpp          |  40 ++--
 llvm/unittests/Analysis/DDGTest.cpp           |  44 ++--
 .../FunctionPropertiesAnalysisTest.cpp        |  44 ++--
 llvm/unittests/Analysis/LazyCallGraphTest.cpp | 208 +++++++++---------
 llvm/unittests/Analysis/SparsePropagation.cpp |   8 +-
 .../unittests/Analysis/UnrollAnalyzerTest.cpp |  30 +--
 llvm/unittests/Analysis/ValueTrackingTest.cpp |  12 +-
 llvm/unittests/MIR/MachineMetadata.cpp        |  12 +-
 .../WebAssemblyExceptionInfoTest.cpp          |   4 +-
 .../Transforms/IPO/AttributorTest.cpp         |  12 +-
 llvm/unittests/Transforms/Scalar/LICMTest.cpp |  12 +-
 .../Transforms/Scalar/LoopPassManagerTest.cpp |  44 ++--
 .../Transforms/Utils/BasicBlockUtilsTest.cpp  |   8 +-
 .../Transforms/Utils/CloningTest.cpp          |  10 +-
 .../Transforms/Utils/CodeExtractorTest.cpp    |  60 ++---
 .../Transforms/Utils/CodeMoverUtilsTest.cpp   | 134 +++++------
 llvm/unittests/Transforms/Utils/LocalTest.cpp |  14 +-
 .../Transforms/Utils/MemTransferLowering.cpp  |  36 +--
 .../Utils/ScalarEvolutionExpanderTest.cpp     |  16 +-
 .../Transforms/Utils/UnrollLoopTest.cpp       |  14 +-
 .../Transforms/Utils/ValueMapperTest.cpp      |   2 +-
 25 files changed, 463 insertions(+), 463 deletions(-)

diff --git a/llvm/unittests/Analysis/AliasAnalysisTest.cpp b/llvm/unittests/Analysis/AliasAnalysisTest.cpp
index 06066b1b92c51..a28d318ab32c8 100644
--- a/llvm/unittests/Analysis/AliasAnalysisTest.cpp
+++ b/llvm/unittests/Analysis/AliasAnalysisTest.cpp
@@ -232,18 +232,18 @@ TEST_F(AliasAnalysisTest, BatchAAPhiCycles) {
   LLVMContext C;
   SMDiagnostic Err;
   std::unique_ptr<Module> M = parseAssemblyString(R"(
-    define void @f(i8* noalias %a, i1 %c) {
+    define void @f(ptr noalias %a, i1 %c) {
     entry:
       br label %loop
 
     loop:
-      %phi = phi i8* [ null, %entry ], [ %a2, %loop ]
+      %phi = phi ptr [ null, %entry ], [ %a2, %loop ]
       %offset1 = phi i64 [ 0, %entry ], [ %offset2, %loop]
       %offset2 = add i64 %offset1, 1
-      %a1 = getelementptr i8, i8* %a, i64 %offset1
-      %a2 = getelementptr i8, i8* %a, i64 %offset2
-      %s1 = select i1 %c, i8* %a1, i8* %phi
-      %s2 = select i1 %c, i8* %a2, i8* %a1
+      %a1 = getelementptr i8, ptr %a, i64 %offset1
+      %a2 = getelementptr i8, ptr %a, i64 %offset2
+      %s1 = select i1 %c, ptr %a1, ptr %phi
+      %s2 = select i1 %c, ptr %a2, ptr %a1
       br label %loop
     }
   )", Err, C);
@@ -280,15 +280,15 @@ TEST_F(AliasAnalysisTest, BatchAAPhiAssumption) {
   LLVMContext C;
   SMDiagnostic Err;
   std::unique_ptr<Module> M = parseAssemblyString(R"(
-    define void @f(i8* %a.base, i8* %b.base, i1 %c) {
+    define void @f(ptr %a.base, ptr %b.base, i1 %c) {
     entry:
       br label %loop
 
     loop:
-      %a = phi i8* [ %a.next, %loop ], [ %a.base, %entry ]
-      %b = phi i8* [ %b.next, %loop ], [ %b.base, %entry ]
-      %a.next = getelementptr i8, i8* %a, i64 1
-      %b.next = getelementptr i8, i8* %b, i64 1
+      %a = phi ptr [ %a.next, %loop ], [ %a.base, %entry ]
+      %b = phi ptr [ %b.next, %loop ], [ %b.base, %entry ]
+      %a.next = getelementptr i8, ptr %a, i64 1
+      %b.next = getelementptr i8, ptr %b, i64 1
       br label %loop
     }
   )", Err, C);
@@ -318,16 +318,16 @@ TEST_F(AliasAnalysisTest, PartialAliasOffset) {
   LLVMContext C;
   SMDiagnostic Err;
   std::unique_ptr<Module> M = parseAssemblyString(R"(
-    define void @foo(float* %arg, i32 %i) {
+    define void @foo(ptr %arg, i32 %i) {
     bb:
       %i2 = zext i32 %i to i64
-      %i3 = getelementptr inbounds float, float* %arg, i64 %i2
-      %i4 = bitcast float* %i3 to <2 x float>*
-      %L1 = load <2 x float>, <2 x float>* %i4, align 16
+      %i3 = getelementptr inbounds float, ptr %arg, i64 %i2
+      %i4 = bitcast ptr %i3 to ptr
+      %L1 = load <2 x float>, ptr %i4, align 16
       %i7 = add nuw nsw i32 %i, 1
       %i8 = zext i32 %i7 to i64
-      %i9 = getelementptr inbounds float, float* %arg, i64 %i8
-      %L2 = load float, float* %i9, align 4
+      %i9 = getelementptr inbounds float, ptr %arg, i64 %i8
+      %L2 = load float, ptr %i9, align 4
       ret void
     }
   )",
@@ -353,11 +353,11 @@ TEST_F(AliasAnalysisTest, PartialAliasOffsetSign) {
   LLVMContext C;
   SMDiagnostic Err;
   std::unique_ptr<Module> M = parseAssemblyString(R"(
-    define void @f(i64* %p) {
-      %L1 = load i64, i64* %p
-      %p.i8 = bitcast i64* %p to i8*
-      %q = getelementptr i8,  i8* %p.i8, i32 1
-      %L2 = load i8, i8* %q
+    define void @f(ptr %p) {
+      %L1 = load i64, ptr %p
+      %p.i8 = bitcast ptr %p to ptr
+      %q = getelementptr i8,  ptr %p.i8, i32 1
+      %L2 = load i8, ptr %q
       ret void
     }
   )",
@@ -388,10 +388,10 @@ class AAPassInfraTest : public testing::Test {
 
 public:
   AAPassInfraTest()
-      : M(parseAssemblyString("define i32 @f(i32* %x, i32* %y) {\n"
+      : M(parseAssemblyString("define i32 @f(ptr %x, ptr %y) {\n"
                               "entry:\n"
-                              "  %lx = load i32, i32* %x\n"
-                              "  %ly = load i32, i32* %y\n"
+                              "  %lx = load i32, ptr %x\n"
+                              "  %ly = load i32, ptr %y\n"
                               "  %sum = add i32 %lx, %ly\n"
                               "  ret i32 %sum\n"
                               "}\n",
diff --git a/llvm/unittests/Analysis/AliasSetTrackerTest.cpp b/llvm/unittests/Analysis/AliasSetTrackerTest.cpp
index e784e6eefb79c..b5adc84185c96 100644
--- a/llvm/unittests/Analysis/AliasSetTrackerTest.cpp
+++ b/llvm/unittests/Analysis/AliasSetTrackerTest.cpp
@@ -26,13 +26,13 @@ TEST(AliasSetTracker, AliasUnknownInst) {
 
     ; Function Attrs: nounwind ssp uwtable
     define i32 @read_a() #0 {
-      %1 = load i32, i32* @a, align 4, !tbaa !3
+      %1 = load i32, ptr @a, align 4, !tbaa !3
       ret i32 %1
     }
 
     ; Function Attrs: nounwind ssp uwtable
     define void @write_b() #0 {
-      store float 1.000000e+01, float* @b, align 4, !tbaa !7
+      store float 1.000000e+01, ptr @b, align 4, !tbaa !7
       ret void
     }
 
@@ -72,7 +72,7 @@ TEST(AliasSetTracker, AliasUnknownInst) {
   AliasSetTracker AST(BAA);
   for (auto &BB : *Test)
     AST.add(BB);
-  // There should be 2 disjoint alias sets. 1 from each call. 
+  // There should be 2 disjoint alias sets. 1 from each call.
   ASSERT_EQ((int)AST.getAliasSets().size(), 2);
 
   // Directly test aliasesUnknownInst.
diff --git a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp
index 5fd2ecc4f29b6..921e2aa8cd30b 100644
--- a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp
+++ b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp
@@ -74,18 +74,18 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) {
   EnableKnowledgeRetention.setValue(true);
   StringRef Head =
       "declare void @llvm.assume(i1)\n"
-      "declare void @func(i32*, i32*, i32*)\n"
-      "declare void @func1(i32*, i32*, i32*, i32*)\n"
-      "declare void @func_many(i32*) \"no-jump-tables\" nounwind "
+      "declare void @func(ptr, ptr, ptr)\n"
+      "declare void @func1(ptr, ptr, ptr, ptr)\n"
+      "declare void @func_many(ptr) \"no-jump-tables\" nounwind "
       "\"less-precise-fpmad\" willreturn norecurse\n"
-      "define void @test(i32* %P, i32* %P1, i32* %P2, i32* %P3) {\n";
+      "define void @test(ptr %P, ptr %P1, ptr %P2, ptr %P3) {\n";
   StringRef Tail = "ret void\n"
                    "}";
   std::vector<std::pair<StringRef, llvm::function_ref<void(Instruction *)>>>
       Tests;
   Tests.push_back(std::make_pair(
-      "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align "
-      "8 noalias %P1, i32* align 8 noundef %P2)\n",
+      "call void @func(ptr nonnull align 4 dereferenceable(16) %P, ptr align "
+      "8 noalias %P1, ptr align 8 noundef %P2)\n",
       [](Instruction *I) {
         auto *Assume = buildAssumeFromInst(I);
         Assume->insertBefore(I->getIterator());
@@ -103,11 +103,11 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) {
                                      Attribute::AttrKind::Alignment, 4));
       }));
   Tests.push_back(std::make_pair(
-      "call void @func1(i32* nonnull align 32 dereferenceable(48) %P, i32* "
+      "call void @func1(ptr nonnull align 32 dereferenceable(48) %P, ptr "
       "nonnull "
-      "align 8 dereferenceable(28) %P, i32* nonnull align 64 "
+      "align 8 dereferenceable(28) %P, ptr nonnull align 64 "
       "dereferenceable(4) "
-      "%P, i32* nonnull align 16 dereferenceable(12) %P)\n",
+      "%P, ptr nonnull align 16 dereferenceable(12) %P)\n",
       [](Instruction *I) {
         auto *Assume = buildAssumeFromInst(I);
         Assume->insertBefore(I->getIterator());
@@ -127,7 +127,7 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) {
                                      Attribute::AttrKind::Alignment, 64));
       }));
   Tests.push_back(std::make_pair(
-      "call void @func_many(i32* align 8 noundef %P1) cold\n", [](Instruction *I) {
+      "call void @func_many(ptr align 8 noundef %P1) cold\n", [](Instruction *I) {
         ShouldPreserveAllAttributes.setValue(true);
         auto *Assume = buildAssumeFromInst(I);
         Assume->insertBefore(I->getIterator());
@@ -142,11 +142,11 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) {
         ASSERT_TRUE(hasMatchesExactlyAttributes(Assume, nullptr, ""));
       }));
   Tests.push_back(std::make_pair(
-      "call void @func1(i32* readnone align 32 "
-      "dereferenceable(48) noalias %P, i32* "
-      "align 8 dereferenceable(28) %P1, i32* align 64 "
+      "call void @func1(ptr readnone align 32 "
+      "dereferenceable(48) noalias %P, ptr "
+      "align 8 dereferenceable(28) %P1, ptr align 64 "
       "dereferenceable(4) "
-      "%P2, i32* nonnull align 16 dereferenceable(12) %P3)\n",
+      "%P2, ptr nonnull align 16 dereferenceable(12) %P3)\n",
       [](Instruction *I) {
         auto *Assume = buildAssumeFromInst(I);
         Assume->insertBefore(I->getIterator());
@@ -178,11 +178,11 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) {
       }));
 
   Tests.push_back(std::make_pair(
-      "call void @func1(i32* readnone align 32 "
-      "dereferenceable(48) noalias %P, i32* "
-      "align 8 dereferenceable(28) %P1, i32* align 64 "
+      "call void @func1(ptr readnone align 32 "
+      "dereferenceable(48) noalias %P, ptr "
+      "align 8 dereferenceable(28) %P1, ptr align 64 "
       "dereferenceable(4) "
-      "%P2, i32* nonnull align 16 dereferenceable(12) %P3)\n",
+      "%P2, ptr nonnull align 16 dereferenceable(12) %P3)\n",
       [](Instruction *I) {
         auto *Assume = buildAssumeFromInst(I);
         Assume->insertBefore(I->getIterator());
@@ -204,8 +204,8 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) {
                                      Attribute::AttrKind::Dereferenceable, 48));
       }));
   Tests.push_back(std::make_pair(
-      "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align "
-      "8 noalias %P1, i32* %P1)\n",
+      "call void @func(ptr nonnull align 4 dereferenceable(16) %P, ptr align "
+      "8 noalias %P1, ptr %P1)\n",
       [](Instruction *I) {
         auto *Assume = buildAssumeFromInst(I);
         Assume->insertBefore(I->getIterator());
@@ -251,18 +251,18 @@ TEST(AssumeQueryAPI, fillMapFromAssume) {
   EnableKnowledgeRetention.setValue(true);
   StringRef Head =
       "declare void @llvm.assume(i1)\n"
-      "declare void @func(i32*, i32*, i32*)\n"
-      "declare void @func1(i32*, i32*, i32*, i32*)\n"
-      "declare void @func_many(i32*) \"no-jump-tables\" nounwind "
+      "declare void @func(ptr, ptr, ptr)\n"
+      "declare void @func1(ptr, ptr, ptr, ptr)\n"
+      "declare void @func_many(ptr) \"no-jump-tables\" nounwind "
       "\"less-precise-fpmad\" willreturn norecurse\n"
-      "define void @test(i32* %P, i32* %P1, i32* %P2, i32* %P3) {\n";
+      "define void @test(ptr %P, ptr %P1, ptr %P2, ptr %P3) {\n";
   StringRef Tail = "ret void\n"
                    "}";
   std::vector<std::pair<StringRef, llvm::function_ref<void(Instruction *)>>>
       Tests;
   Tests.push_back(std::make_pair(
-      "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align "
-      "8 noalias %P1, i32* align 8 dereferenceable(8) %P2)\n",
+      "call void @func(ptr nonnull align 4 dereferenceable(16) %P, ptr align "
+      "8 noalias %P1, ptr align 8 dereferenceable(8) %P2)\n",
       [](Instruction *I) {
         auto *Assume = buildAssumeFromInst(I);
         Assume->insertBefore(I->getIterator());
@@ -283,11 +283,11 @@ TEST(AssumeQueryAPI, fillMapFromAssume) {
                                {4, 4}));
       }));
   Tests.push_back(std::make_pair(
-      "call void @func1(i32* nonnull align 32 dereferenceable(48) %P, i32* "
+      "call void @func1(ptr nonnull align 32 dereferenceable(48) %P, ptr "
       "nonnull "
-      "align 8 dereferenceable(28) %P, i32* nonnull align 64 "
+      "align 8 dereferenceable(28) %P, ptr nonnull align 64 "
       "dereferenceable(4) "
-      "%P, i32* nonnull align 16 dereferenceable(12) %P)\n",
+      "%P, ptr nonnull align 16 dereferenceable(12) %P)\n",
       [](Instruction *I) {
         auto *Assume = buildAssumeFromInst(I);
         Assume->insertBefore(I->getIterator());
@@ -310,7 +310,7 @@ TEST(AssumeQueryAPI, fillMapFromAssume) {
             Map, Assume, {I->getOperand(0), Attribute::Alignment}, {64, 64}));
       }));
   Tests.push_back(std::make_pair(
-      "call void @func_many(i32* align 8 %P1) cold\n", [](Instruction *I) {
+      "call void @func_many(ptr align 8 %P1) cold\n", [](Instruction *I) {
         ShouldPreserveAllAttributes.setValue(true);
         auto *Assume = buildAssumeFromInst(I);
         Assume->insertBefore(I->getIterator());
@@ -331,11 +331,11 @@ TEST(AssumeQueryAPI, fillMapFromAssume) {
         ASSERT_TRUE(Map.empty());
       }));
   Tests.push_back(std::make_pair(
-      "call void @func1(i32* readnone align 32 "
-      "dereferenceable(48) noalias %P, i32* "
-      "align 8 dereferenceable(28) %P1, i32* align 64 "
+      "call void @func1(ptr readnone align 32 "
+      "dereferenceable(48) noalias %P, ptr "
+      "align 8 dereferenceable(28) %P1, ptr align 64 "
       "dereferenceable(4) "
-      "%P2, i32* nonnull align 16 dereferenceable(12) %P3)\n",
+      "%P2, ptr nonnull align 16 dereferenceable(12) %P3)\n",
       [](Instruction *I) {
         auto *Assume = buildAssumeFromInst(I);
         Assume->insertBefore(I->getIterator());
@@ -371,8 +371,8 @@ TEST(AssumeQueryAPI, fillMapFromAssume) {
 
   /// Keep this test last as it modifies the function.
   Tests.push_back(std::make_pair(
-      "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align "
-      "8 noalias %P1, i32* %P2)\n",
+      "call void @func(ptr nonnull align 4 dereferenceable(16) %P, ptr align "
+      "8 noalias %P1, ptr %P2)\n",
       [](Instruction *I) {
         auto *Assume = buildAssumeFromInst(I);
         Assume->insertBefore(I->getIterator());
@@ -507,11 +507,11 @@ TEST(AssumeQueryAPI, AssumptionCache) {
   SMDiagnostic Err;
   std::unique_ptr<Module> Mod = parseAssemblyString(
       "declare void @llvm.assume(i1)\n"
-      "define void @test(i32* %P, i32* %P1, i32* %P2, i32* %P3, i1 %B) {\n"
-      "call void @llvm.assume(i1 true) [\"nonnull\"(i32* %P), \"align\"(i32* "
-      "%P2, i32 4), \"align\"(i32* %P, i32 8)]\n"
-      "call void @llvm.assume(i1 %B) [\"test\"(i32* %P1), "
-      "\"dereferenceable\"(i32* %P, i32 4)]\n"
+      "define void @test(ptr %P, ptr %P1, ptr %P2, ptr %P3, i1 %B) {\n"
+      "call void @llvm.assume(i1 true) [\"nonnull\"(ptr %P), \"align\"(ptr "
+      "%P2, i32 4), \"align\"(ptr %P, i32 8)]\n"
+      "call void @llvm.assume(i1 %B) [\"test\"(ptr %P1), "
+      "\"dereferenceable\"(ptr %P, i32 4)]\n"
       "ret void\n}\n",
       Err, C);
   if (!Mod)
@@ -569,11 +569,11 @@ TEST(AssumeQueryAPI, Alignment) {
   SMDiagnostic Err;
   std::unique_ptr<Module> Mod = parseAssemblyString(
       "declare void @llvm.assume(i1)\n"
-      "define void @test(i32* %P, i32* %P1, i32* %P2, i32 %I3, i1 %B) {\n"
-      "call void @llvm.assume(i1 true) [\"align\"(i32* %P, i32 8, i32 %I3)]\n"
-      "call void @llvm.assume(i1 true) [\"align\"(i32* %P1, i32 %I3, i32 "
+      "define void @test(ptr %P, ptr %P1, ptr %P2, i32 %I3, i1 %B) {\n"
+      "call void @llvm.assume(i1 true) [\"align\"(ptr %P, i32 8, i32 %I3)]\n"
+      "call void @llvm.assume(i1 true) [\"align\"(ptr %P1, i32 %I3, i32 "
       "%I3)]\n"
-      "call void @llvm.assume(i1 true) [\"align\"(i32* %P2, i32 16, i32 8)]\n"
+      "call void @llvm.assume(i1 true) [\"align\"(ptr %P2, i32 16, i32 8)]\n"
       "ret void\n}\n",
       Err, C);
   if (!Mod)
diff --git a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp
index 17240a1c73bce..bf5afe8e79354 100644
--- a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp
+++ b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp
@@ -1936,26 +1936,26 @@ TEST_F(CGSCCPassManagerTest, TestDeletionOfFunctionInNonTrivialRefSCC) {
 TEST_F(CGSCCPassManagerTest, TestInsertionOfNewNonTrivialCallEdge) {
   std::unique_ptr<Module> M = parseIR("define void @f1() {\n"
                                       "entry:\n"
-                                      "  %a = bitcast void ()* @f4 to i8*\n"
-                                      "  %b = bitcast void ()* @f2 to i8*\n"
+                                      "  %a = bitcast ptr @f4 to ptr\n"
+                                      "  %b = bitcast ptr @f2 to ptr\n"
                                       "  ret void\n"
                                       "}\n"
                                       "define void @f2() {\n"
                                       "entry:\n"
-                                      "  %a = bitcast void ()* @f1 to i8*\n"
-                                      "  %b = bitcast void ()* @f3 to i8*\n"
+                                      "  %a = bitcast ptr @f1 to ptr\n"
+                                      "  %b = bitcast ptr @f3 to ptr\n"
                                       "  ret void\n"
                                       "}\n"
                                       "define void @f3() {\n"
                                       "entry:\n"
-                                      "  %a = bitcast void ()* @f2 to i8*\n"
-                                      "  %b = bitcast void ()* @f4 to i8*\n"
+                                      "  %a = bitcast ptr @f2 to ptr\n"
+                                      "  %b = bitcast ptr @f4 to ptr\n"
                                       "  ret void\n"
                                       "}\n"
                                       "define void @f4() {\n"
                                       "entry:\n"
-                                      "  %a = bitcast void ()* @f3 to i8*\n"
-                                      "  %b = bitcast void ()* @f1 to i8*\n"
+                                      "  %a = bitcast ptr @f3 to ptr\n"
+                                      "  %b = bitcast ptr @f1 to ptr\n"
                                       "  ret void\n"
                                       "}\n");
 
diff --git a/llvm/unittests/Analysis/CaptureTrackingTest.cpp b/llvm/unittests/Analysis/CaptureTrackingTest.cpp
index ea3f21efc014c..d7ee5252d50be 100644
--- a/llvm/unittests/Analysis/CaptureTrackingTest.cpp
+++ b/llvm/unittests/Analysis/CaptureTrackingTest.cpp
@@ -20,27 +20,27 @@ using namespace llvm;
 TEST(CaptureTracking, MaxUsesToExplore) {
   StringRef Assembly = R"(
     ; Function Attrs: nounwind ssp uwtable
-    declare void @doesnt_capture(i8* nocapture, i8* nocapture, i8* nocapture, 
-                                 i8* nocapture, i8* nocapture)
+    declare void @doesnt_capture(ptr nocapture, ptr nocapture, ptr nocapture,
+                                 ptr nocapture, ptr nocapture)
 
     ; %arg has 5 uses
-    define void @test_few_uses(i8* %arg) {
-      call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg)
+    define void @test_few_uses(ptr %arg) {
+      call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg)
       ret void
     }
 
     ; %arg has 50 uses
-    define void @test_many_uses(i8* %arg) {
-      call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg)
-      call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg)
-      call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg)
-      call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg)
-      call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg)
-      call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg)
-      call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg)
-      call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg)
-      call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg)
-      call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg)
+    define void @test_many_uses(ptr %arg) {
+      call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg)
+      call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg)
+      call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg)
+      call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg)
+      call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg)
+      call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg)
+      call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg)
+      call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg)
+      call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg)
+      call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg)
       ret void
     }
   )";
@@ -85,12 +85,12 @@ struct CollectingCaptureTracker : public CaptureTracker {
 
 TEST(CaptureTracking, MultipleUsesInSameInstruction) {
   StringRef Assembly = R"(
-    declare void @call(i8*, i8*, i8*)
+    declare void @call(ptr, ptr, ptr)
 
-    define void @test(i8* %arg, i8** %ptr) {
-      call void @call(i8* %arg, i8* nocapture %arg, i8* %arg) [ "bundle"(i8* %arg) ]
-      cmpxchg i8** %ptr, i8* %arg, i8* %arg acq_rel monotonic
-      icmp eq i8* %arg, %arg
+    define void @test(ptr %arg, ptr %ptr) {
+      call void @call(ptr %arg, ptr nocapture %arg, ptr %arg) [ "bundle"(ptr %arg) ]
+      cmpxchg ptr %ptr, ptr %arg, ptr %arg acq_rel monotonic
+      icmp eq ptr %arg, %arg
       ret void
     }
   )";
diff --git a/llvm/unittests/Analysis/DDGTest.cpp b/llvm/unittests/Analysis/DDGTest.cpp
index 7fcdfdb62da43..12944a3f0cf3f 100644
--- a/llvm/unittests/Analysis/DDGTest.cpp
+++ b/llvm/unittests/Analysis/DDGTest.cpp
@@ -51,7 +51,7 @@ TEST(DDGTest, getDependencies) {
       "target datalayout = \"e-m:e-i64:64-n32:64\"\n"
       "target triple = \"powerpc64le-unknown-linux-gnu\"\n"
       "\n"
-      "define dso_local void @foo(i32 signext %n, i32* noalias %A, i32* "
+      "define dso_local void @foo(i32 signext %n, ptr noalias %A, ptr "
       "noalias %B) {\n"
       "entry:\n"
       "   %cmp1 = icmp sgt i32 %n, 0\n"
@@ -64,16 +64,16 @@ TEST(DDGTest, getDependencies) {
       " for.body:\n"
       "   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ "
       "%indvars.iv.next, %for.body ]\n"
-      "   %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv\n"
+      "   %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv\n"
       "  %0 = trunc i64 %indvars.iv to i32\n"
-      "  store i32 %0, i32* %arrayidx, align 4\n"
+      "  store i32 %0, ptr %arrayidx, align 4\n"
       "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
-      "  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 "
+      "  %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 "
       "%indvars.iv.next\n"
-      "  %1 = load i32, i32* %arrayidx2, align 4\n"
+      "  %1 = load i32, ptr %arrayidx2, align 4\n"
       "  %add3 = add nsw i32 %1, 1\n"
-      "  %arrayidx5 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv\n"
-      "  store i32 %add3, i32* %arrayidx5, align 4\n"
+      "  %arrayidx5 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv\n"
+      "  store i32 %add3, ptr %arrayidx5, align 4\n"
       "  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count\n"
       "  br i1 %exitcond, label %for.body, label %for.end.loopexit\n"
       "\n"
@@ -142,8 +142,8 @@ TEST(DDGTest, avoidDuplicateEdgesToFromPiBlocks) {
   const char *ModuleStr =
       "target datalayout = \"e-m:e-i64:64-n32:64-v256:256:256-v512:512:512\"\n"
       "\n"
-      "define void @foo(float* noalias %A, float* noalias %B, float* noalias "
-      "%C, float* noalias %D, i32 signext %n) {\n"
+      "define void @foo(ptr noalias %A, ptr noalias %B, ptr noalias "
+      "%C, ptr noalias %D, i32 signext %n) {\n"
       "entry:\n"
       "  %cmp1 = icmp sgt i32 %n, 0\n"
       "  br i1 %cmp1, label %for.body.preheader, label %for.end\n"
@@ -156,26 +156,26 @@ TEST(DDGTest, avoidDuplicateEdgesToFromPiBlocks) {
       "%for.body.preheader, %if.end\n"
       "  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, "
       "%if.end ]\n"
-      "  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv\n"
-      "  %loadASubI = load float, float* %arrayidx, align 4\n"
-      "  %arrayidx2 = getelementptr inbounds float, float* %B, i64 "
+      "  %arrayidx = getelementptr inbounds float, ptr %A, i64 %indvars.iv\n"
+      "  %loadASubI = load float, ptr %arrayidx, align 4\n"
+      "  %arrayidx2 = getelementptr inbounds float, ptr %B, i64 "
       "%indvars.iv\n"
-      "  %loadBSubI = load float, float* %arrayidx2, align 4\n"
+      "  %loadBSubI = load float, ptr %arrayidx2, align 4\n"
       "  %add = fadd fast float %loadASubI, %loadBSubI\n"
-      "  %arrayidx4 = getelementptr inbounds float, float* %A, i64 "
+      "  %arrayidx4 = getelementptr inbounds float, ptr %A, i64 "
       "%indvars.iv\n"
-      "  store float %add, float* %arrayidx4, align 4\n"
-      "  %arrayidx6 = getelementptr inbounds float, float* %A, i64 "
+      "  store float %add, ptr %arrayidx4, align 4\n"
+      "  %arrayidx6 = getelementptr inbounds float, ptr %A, i64 "
       "%indvars.iv\n"
-      "  %0 = load float, float* %arrayidx6, align 4\n"
+      "  %0 = load float, ptr %arrayidx6, align 4\n"
       "  %add7 = fadd fast float %0, 1.000000e+00\n"
       "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
-      "  %arrayidx10 = getelementptr inbounds float, float* %B, i64 "
+      "  %arrayidx10 = getelementptr inbounds float, ptr %B, i64 "
       "%indvars.iv.next\n"
-      "  store float %add7, float* %arrayidx10, align 4\n"
-      "  %arrayidx12 = getelementptr inbounds float, float* %A, i64 "
+      "  store float %add7, ptr %arrayidx10, align 4\n"
+      "  %arrayidx12 = getelementptr inbounds float, ptr %A, i64 "
       "%indvars.iv\n"
-      "  %1 = load float, float* %arrayidx12, align 4\n"
+      "  %1 = load float, ptr %arrayidx12, align 4\n"
       "  %cmp13 = fcmp fast ogt float %1, 1.000000e+02\n"
       "  br i1 %cmp13, label %if.then, label %if.else\n"
       "\n"
@@ -188,7 +188,7 @@ TEST(DDGTest, avoidDuplicateEdgesToFromPiBlocks) {
       "if.end:                                           ; preds = %if.else, "
       "%if.then\n"
       "  %ff.0 = phi float [ %add, %if.then ], [ %add7, %if.else ]\n"
-      "  store float %ff.0, float* %C, align 4\n"
+      "  store float %ff.0, ptr %C, align 4\n"
       "  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count\n"
       "  br i1 %exitcond, label %for.body, label %for.end.loopexit\n"
       "\n"
diff --git a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp
index 497da8f3fc70b..dc5d0a8a7ca9b 100644
--- a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp
+++ b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp
@@ -457,7 +457,7 @@ define internal void @callee() {
   ret void
 }
 
-define i32 @caller() personality i32 (...)* @__gxx_personality_v0 {
+define i32 @caller() personality ptr @__gxx_personality_v0 {
 entry:
   invoke void @callee()
       to label %cont unwind label %exc
@@ -466,7 +466,7 @@ define i32 @caller() personality i32 (...)* @__gxx_personality_v0 {
   ret i32 0
 
 exc:
-  %exn = landingpad {i8*, i32}
+  %exn = landingpad {ptr, i32}
          cleanup
   ret i32 1
 }
@@ -498,7 +498,7 @@ TEST_F(FunctionPropertiesAnalysisTest, InvokeUnreachableHandler) {
                                              R"IR(
 declare void @might_throw()
 
-define internal i32 @callee() personality i32 (...)* @__gxx_personality_v0 {
+define internal i32 @callee() personality ptr @__gxx_personality_v0 {
 entry:
   invoke void @might_throw()
       to label %cont unwind label %exc
@@ -507,12 +507,12 @@ define internal i32 @callee() personality i32 (...)* @__gxx_personality_v0 {
   ret i32 0
 
 exc:
-  %exn = landingpad {i8*, i32}
+  %exn = landingpad {ptr, i32}
          cleanup
-  resume { i8*, i32 } %exn
+  resume { ptr, i32 } %exn
 }
 
-define i32 @caller() personality i32 (...)* @__gxx_personality_v0 {
+define i32 @caller() personality ptr @__gxx_personality_v0 {
 entry:
   %X = invoke i32 @callee()
            to label %cont unwind label %Handler
@@ -521,7 +521,7 @@ define i32 @caller() personality i32 (...)* @__gxx_personality_v0 {
   ret i32 %X
 
 Handler:
-  %exn = landingpad {i8*, i32}
+  %exn = landingpad {ptr, i32}
          cleanup
   ret i32 1
 }
@@ -554,7 +554,7 @@ TEST_F(FunctionPropertiesAnalysisTest, Rethrow) {
                                              R"IR(
 declare void @might_throw()
 
-define internal i32 @callee() personality i32 (...)* @__gxx_personality_v0 {
+define internal i32 @callee() personality ptr @__gxx_personality_v0 {
 entry:
   invoke void @might_throw()
       to label %cont unwind label %exc
@@ -563,12 +563,12 @@ define internal i32 @callee() personality i32 (...)* @__gxx_personality_v0 {
   ret i32 0
 
 exc:
-  %exn = landingpad {i8*, i32}
+  %exn = landingpad {ptr, i32}
          cleanup
-  resume { i8*, i32 } %exn
+  resume { ptr, i32 } %exn
 }
 
-define i32 @caller() personality i32 (...)* @__gxx_personality_v0 {
+define i32 @caller() personality ptr @__gxx_personality_v0 {
 entry:
   %X = invoke i32 @callee()
            to label %cont unwind label %Handler
@@ -577,7 +577,7 @@ define i32 @caller() personality i32 (...)* @__gxx_personality_v0 {
   ret i32 %X
 
 Handler:
-  %exn = landingpad {i8*, i32}
+  %exn = landingpad {ptr, i32}
          cleanup
   ret i32 1
 }
@@ -612,18 +612,18 @@ declare void @external_func()
 @exception_type2 = external global i8
 
 
-define internal void @inner() personality i8* null {
+define internal void @inner() personality ptr null {
   invoke void @external_func()
       to label %cont unwind label %lpad
 cont:
   ret void
 lpad:
   %lp = landingpad i32
-      catch i8* @exception_type1
+      catch ptr @exception_type1
   resume i32 %lp
 }
 
-define void @outer() personality i8* null {
+define void @outer() personality ptr null {
   invoke void @inner()
       to label %cont unwind label %lpad
 cont:
@@ -631,7 +631,7 @@ define void @outer() personality i8* null {
 lpad:
   %lp = landingpad i32
       cleanup
-      catch i8* @exception_type2
+      catch ptr @exception_type2
   resume i32 %lp
 }
 
@@ -666,18 +666,18 @@ declare void @external_func()
 @exception_type2 = external global i8
 
 
-define internal void @inner() personality i8* null {
+define internal void @inner() personality ptr null {
   invoke void @external_func()
       to label %cont unwind label %lpad
 cont:
   ret void
 lpad:
   %lp = landingpad i32
-      catch i8* @exception_type1
+      catch ptr @exception_type1
   resume i32 %lp
 }
 
-define void @outer(i32 %a) personality i8* null {
+define void @outer(i32 %a) personality ptr null {
 entry:
   %i = icmp slt i32 %a, 0
   br i1 %i, label %if.then, label %cont
@@ -689,7 +689,7 @@ if.then:
 lpad:
   %lp = landingpad i32
       cleanup
-      catch i8* @exception_type2
+      catch ptr @exception_type2
   resume i32 %lp
 }
 
@@ -931,9 +931,9 @@ TEST_F(FunctionPropertiesAnalysisTest, DetailedOperandCount) {
 @a = global i64 1
 
 define i64 @f1(i64 %e) {
-	%b = load i64, i64* @a
+	%b = load i64, ptr @a
   %c = add i64 %b, 2
-  %d = call i64 asm "mov $1,$0", "=r,r" (i64 %c)																						
+  %d = call i64 asm "mov $1,$0", "=r,r" (i64 %c)
 	%f = add i64 %d, %e
 	ret i64 %f
 }
diff --git a/llvm/unittests/Analysis/LazyCallGraphTest.cpp b/llvm/unittests/Analysis/LazyCallGraphTest.cpp
index 4a4ff3242c4c4..5c0bfbd74555e 100644
--- a/llvm/unittests/Analysis/LazyCallGraphTest.cpp
+++ b/llvm/unittests/Analysis/LazyCallGraphTest.cpp
@@ -142,78 +142,78 @@ static const char DiamondOfTriangles[] =
 static const char DiamondOfTrianglesRefGraph[] =
      "define void @a1() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @a2, void ()** %a\n"
-     "  store void ()* @b2, void ()** %a\n"
-     "  store void ()* @c3, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @a2, ptr %a\n"
+     "  store ptr @b2, ptr %a\n"
+     "  store ptr @c3, ptr %a\n"
      "  ret void\n"
      "}\n"
      "define void @a2() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @a3, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @a3, ptr %a\n"
      "  ret void\n"
      "}\n"
      "define void @a3() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @a1, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @a1, ptr %a\n"
      "  ret void\n"
      "}\n"
      "define void @b1() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @b2, void ()** %a\n"
-     "  store void ()* @d3, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @b2, ptr %a\n"
+     "  store ptr @d3, ptr %a\n"
      "  ret void\n"
      "}\n"
      "define void @b2() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @b3, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @b3, ptr %a\n"
      "  ret void\n"
      "}\n"
      "define void @b3() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @b1, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @b1, ptr %a\n"
      "  ret void\n"
      "}\n"
      "define void @c1() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @c2, void ()** %a\n"
-     "  store void ()* @d2, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @c2, ptr %a\n"
+     "  store ptr @d2, ptr %a\n"
      "  ret void\n"
      "}\n"
      "define void @c2() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @c3, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @c3, ptr %a\n"
      "  ret void\n"
      "}\n"
      "define void @c3() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @c1, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @c1, ptr %a\n"
      "  ret void\n"
      "}\n"
      "define void @d1() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @d2, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @d2, ptr %a\n"
      "  ret void\n"
      "}\n"
      "define void @d2() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @d3, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @d3, ptr %a\n"
      "  ret void\n"
      "}\n"
      "define void @d3() {\n"
      "entry:\n"
-     "  %a = alloca void ()*\n"
-     "  store void ()* @d1, void ()** %a\n"
+     "  %a = alloca ptr\n"
+     "  store ptr @d1, ptr %a\n"
      "  ret void\n"
      "}\n";
 
@@ -1005,20 +1005,20 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeRefCycle) {
   std::unique_ptr<Module> M =
       parseAssembly(Context, "define void @a() {\n"
                              "entry:\n"
-                             "  %p = alloca void ()*\n"
-                             "  store void ()* @b, void ()** %p\n"
+                             "  %p = alloca ptr\n"
+                             "  store ptr @b, ptr %p\n"
                              "  ret void\n"
                              "}\n"
                              "define void @b() {\n"
                              "entry:\n"
-                             "  %p = alloca void ()*\n"
-                             "  store void ()* @c, void ()** %p\n"
+                             "  %p = alloca ptr\n"
+                             "  store ptr @c, ptr %p\n"
                              "  ret void\n"
                              "}\n"
                              "define void @c() {\n"
                              "entry:\n"
-                             "  %p = alloca void ()*\n"
-                             "  store void ()* @d, void ()** %p\n"
+                             "  %p = alloca ptr\n"
+                             "  store ptr @d, ptr %p\n"
                              "  ret void\n"
                              "}\n"
                              "define void @d() {\n"
@@ -1306,25 +1306,25 @@ TEST(LazyCallGraphTest, InternalEdgeRemoval) {
   LLVMContext Context;
   // A nice fully connected (including self-edges) RefSCC.
   std::unique_ptr<Module> M = parseAssembly(
-      Context, "define void @a(i8** %ptr) {\n"
+      Context, "define void @a(ptr %ptr) {\n"
                "entry:\n"
-               "  store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n"
+               "  store ptr @a, ptr %ptr\n"
+               "  store ptr @b, ptr %ptr\n"
+               "  store ptr @c, ptr %ptr\n"
                "  ret void\n"
                "}\n"
-               "define void @b(i8** %ptr) {\n"
+               "define void @b(ptr %ptr) {\n"
                "entry:\n"
-               "  store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n"
+               "  store ptr @a, ptr %ptr\n"
+               "  store ptr @b, ptr %ptr\n"
+               "  store ptr @c, ptr %ptr\n"
                "  ret void\n"
                "}\n"
-               "define void @c(i8** %ptr) {\n"
+               "define void @c(ptr %ptr) {\n"
                "entry:\n"
-               "  store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n"
+               "  store ptr @a, ptr %ptr\n"
+               "  store ptr @b, ptr %ptr\n"
+               "  store ptr @c, ptr %ptr\n"
                "  ret void\n"
                "}\n");
   LazyCallGraph CG = buildCG(*M);
@@ -1384,25 +1384,25 @@ TEST(LazyCallGraphTest, InternalMultiEdgeRemoval) {
   LLVMContext Context;
   // A nice fully connected (including self-edges) RefSCC.
   std::unique_ptr<Module> M = parseAssembly(
-      Context, "define void @a(i8** %ptr) {\n"
+      Context, "define void @a(ptr %ptr) {\n"
                "entry:\n"
-               "  store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n"
+               "  store ptr @a, ptr %ptr\n"
+               "  store ptr @b, ptr %ptr\n"
+               "  store ptr @c, ptr %ptr\n"
                "  ret void\n"
                "}\n"
-               "define void @b(i8** %ptr) {\n"
+               "define void @b(ptr %ptr) {\n"
                "entry:\n"
-               "  store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n"
+               "  store ptr @a, ptr %ptr\n"
+               "  store ptr @b, ptr %ptr\n"
+               "  store ptr @c, ptr %ptr\n"
                "  ret void\n"
                "}\n"
-               "define void @c(i8** %ptr) {\n"
+               "define void @c(ptr %ptr) {\n"
                "entry:\n"
-               "  store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n"
-               "  store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n"
+               "  store ptr @a, ptr %ptr\n"
+               "  store ptr @b, ptr %ptr\n"
+               "  store ptr @c, ptr %ptr\n"
                "  ret void\n"
                "}\n");
   LazyCallGraph CG = buildCG(*M);
@@ -1454,22 +1454,22 @@ TEST(LazyCallGraphTest, InternalNoOpEdgeRemoval) {
   // Reference edges: a -> b -> c -> a
   //      Call edges: a -> c -> b -> a
   std::unique_ptr<Module> M = parseAssembly(
-      Context, "define void @a(i8** %ptr) {\n"
+      Context, "define void @a(ptr %ptr) {\n"
                "entry:\n"
-               "  call void @b(i8** %ptr)\n"
-               "  store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n"
+               "  call void @b(ptr %ptr)\n"
+               "  store ptr @c, ptr %ptr\n"
                "  ret void\n"
                "}\n"
-               "define void @b(i8** %ptr) {\n"
+               "define void @b(ptr %ptr) {\n"
                "entry:\n"
-               "  store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n"
-               "  call void @c(i8** %ptr)\n"
+               "  store ptr @a, ptr %ptr\n"
+               "  call void @c(ptr %ptr)\n"
                "  ret void\n"
                "}\n"
-               "define void @c(i8** %ptr) {\n"
+               "define void @c(ptr %ptr) {\n"
                "entry:\n"
-               "  call void @a(i8** %ptr)\n"
-               "  store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n"
+               "  call void @a(ptr %ptr)\n"
+               "  store ptr @b, ptr %ptr\n"
                "  ret void\n"
                "}\n");
   LazyCallGraph CG = buildCG(*M);
@@ -1622,24 +1622,24 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCall) {
                              "entry:\n"
                              "  call void @b()\n"
                              "  call void @c()\n"
-                             "  store void()* @d, void()** undef\n"
+                             "  store ptr @d, ptr undef\n"
                              "  ret void\n"
                              "}\n"
                              "define void @b() {\n"
                              "entry:\n"
-                             "  store void()* @c, void()** undef\n"
+                             "  store ptr @c, ptr undef\n"
                              "  call void @d()\n"
                              "  ret void\n"
                              "}\n"
                              "define void @c() {\n"
                              "entry:\n"
-                             "  store void()* @b, void()** undef\n"
+                             "  store ptr @b, ptr undef\n"
                              "  call void @d()\n"
                              "  ret void\n"
                              "}\n"
                              "define void @d() {\n"
                              "entry:\n"
-                             "  store void()* @a, void()** undef\n"
+                             "  store ptr @a, ptr undef\n"
                              "  ret void\n"
                              "}\n");
   LazyCallGraph CG = buildCG(*M);
@@ -1745,13 +1745,13 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallNoCycleInterleaved) {
                              "}\n"
                              "define void @c3() {\n"
                              "entry:\n"
-                             "  store void()* @b1, void()** undef\n"
+                             "  store ptr @b1, ptr undef\n"
                              "  call void @d()\n"
                              "  ret void\n"
                              "}\n"
                              "define void @d() {\n"
                              "entry:\n"
-                             "  store void()* @a, void()** undef\n"
+                             "  store ptr @a, ptr undef\n"
                              "  ret void\n"
                              "}\n");
   LazyCallGraph CG = buildCG(*M);
@@ -1875,13 +1875,13 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallBothPartitionAndMerge) {
                              "}\n"
                              "define void @f() {\n"
                              "entry:\n"
-                             "  store void()* @b, void()** undef\n"
+                             "  store ptr @b, ptr undef\n"
                              "  call void @g()\n"
                              "  ret void\n"
                              "}\n"
                              "define void @g() {\n"
                              "entry:\n"
-                             "  store void()* @a, void()** undef\n"
+                             "  store ptr @a, ptr undef\n"
                              "  ret void\n"
                              "}\n");
   LazyCallGraph CG = buildCG(*M);
@@ -1962,9 +1962,9 @@ TEST(LazyCallGraphTest, HandleBlockAddress) {
                              "bb:\n"
                              "  unreachable\n"
                              "}\n"
-                             "define void @g(i8** %ptr) {\n"
+                             "define void @g(ptr %ptr) {\n"
                              "entry:\n"
-                             "  store i8* blockaddress(@f, %bb), i8** %ptr\n"
+                             "  store ptr blockaddress(@f, %bb), ptr %ptr\n"
                              "  ret void\n"
                              "}\n");
   LazyCallGraph CG = buildCG(*M);
@@ -1991,9 +1991,9 @@ TEST(LazyCallGraphTest, HandleBlockAddress2) {
       parseAssembly(Context, "define void @f() {\n"
                              "  ret void\n"
                              "}\n"
-                             "define void @g(i8** %ptr) {\n"
+                             "define void @g(ptr %ptr) {\n"
                              "bb:\n"
-                             "  store i8* blockaddress(@g, %bb), i8** %ptr\n"
+                             "  store ptr blockaddress(@g, %bb), ptr %ptr\n"
                              "  ret void\n"
                              "}\n");
   LazyCallGraph CG = buildCG(*M);
@@ -2018,31 +2018,31 @@ TEST(LazyCallGraphTest, ReplaceNodeFunction) {
   // function.
   std::unique_ptr<Module> M =
       parseAssembly(Context,
-                    "define void @a(i8** %ptr) {\n"
+                    "define void @a(ptr %ptr) {\n"
                     "entry:\n"
-                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
+                    "  store ptr @d, ptr %ptr\n"
                     "  ret void\n"
                     "}\n"
-                    "define void @b(i8** %ptr) {\n"
+                    "define void @b(ptr %ptr) {\n"
                     "entry:\n"
-                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
-                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
-                    "  call void @d(i8** %ptr)"
+                    "  store ptr @d, ptr %ptr\n"
+                    "  store ptr @d, ptr %ptr\n"
+                    "  call void @d(ptr %ptr)"
                     "  ret void\n"
                     "}\n"
-                    "define void @c(i8** %ptr) {\n"
+                    "define void @c(ptr %ptr) {\n"
                     "entry:\n"
-                    "  call void @d(i8** %ptr)"
-                    "  call void @d(i8** %ptr)"
-                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
+                    "  call void @d(ptr %ptr)"
+                    "  call void @d(ptr %ptr)"
+                    "  store ptr @d, ptr %ptr\n"
                     "  ret void\n"
                     "}\n"
-                    "define void @d(i8** %ptr) {\n"
+                    "define void @d(ptr %ptr) {\n"
                     "entry:\n"
-                    "  store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n"
-                    "  call void @c(i8** %ptr)"
-                    "  call void @d(i8** %ptr)"
-                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
+                    "  store ptr @b, ptr %ptr\n"
+                    "  call void @c(ptr %ptr)"
+                    "  call void @d(ptr %ptr)"
+                    "  store ptr @d, ptr %ptr\n"
                     "  ret void\n"
                     "}\n");
   LazyCallGraph CG = buildCG(*M);
@@ -2098,25 +2098,25 @@ TEST(LazyCallGraphTest, RemoveFunctionWithSpuriousRef) {
   // A graph with a couple of RefSCCs.
   std::unique_ptr<Module> M =
       parseAssembly(Context,
-                    "define void @a(i8** %ptr) {\n"
+                    "define void @a(ptr %ptr) {\n"
                     "entry:\n"
-                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
+                    "  store ptr @d, ptr %ptr\n"
                     "  ret void\n"
                     "}\n"
-                    "define void @b(i8** %ptr) {\n"
+                    "define void @b(ptr %ptr) {\n"
                     "entry:\n"
-                    "  store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n"
+                    "  store ptr @c, ptr %ptr\n"
                     "  ret void\n"
                     "}\n"
-                    "define void @c(i8** %ptr) {\n"
+                    "define void @c(ptr %ptr) {\n"
                     "entry:\n"
-                    "  call void @d(i8** %ptr)"
+                    "  call void @d(ptr %ptr)"
                     "  ret void\n"
                     "}\n"
-                    "define void @d(i8** %ptr) {\n"
+                    "define void @d(ptr %ptr) {\n"
                     "entry:\n"
-                    "  call void @c(i8** %ptr)"
-                    "  store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n"
+                    "  call void @c(ptr %ptr)"
+                    "  store ptr @b, ptr %ptr\n"
                     "  ret void\n"
                     "}\n"
                     "define void @dead() {\n"
@@ -2965,7 +2965,7 @@ TEST(LazyCallGraphTest, AddSplitFunctions5) {
   LLVMContext Context;
   std::unique_ptr<Module> M =
       parseAssembly(Context, "define void @f() {\n"
-                             "  %1 = bitcast void ()* @f2 to i8*\n"
+                             "  %1 = bitcast ptr @f2 to ptr\n"
                              "  ret void\n"
                              "}\n"
                              "define void @f2() {\n"
diff --git a/llvm/unittests/Analysis/SparsePropagation.cpp b/llvm/unittests/Analysis/SparsePropagation.cpp
index ca73a480cbb2d..0cbf5de81c808 100644
--- a/llvm/unittests/Analysis/SparsePropagation.cpp
+++ b/llvm/unittests/Analysis/SparsePropagation.cpp
@@ -357,9 +357,9 @@ TEST_F(SparsePropagationTest, GlobalVariableOverDefined) {
 
 /// Test that we propagate information through function returns.
 ///
-/// define internal i64 @f(i1* %cond) {
+/// define internal i64 @f(ptr %cond) {
 /// if:
-///   %0 = load i1, i1* %cond
+///   %0 = load i1, ptr %cond
 ///   br i1 %0, label %then, label %else
 ///
 /// then:
@@ -397,9 +397,9 @@ TEST_F(SparsePropagationTest, FunctionDefined) {
 
 /// Test that we propagate information through function returns.
 ///
-/// define internal i64 @f(i1* %cond) {
+/// define internal i64 @f(ptr %cond) {
 /// if:
-///   %0 = load i1, i1* %cond
+///   %0 = load i1, ptr %cond
 ///   br i1 %0, label %then, label %else
 ///
 /// then:
diff --git a/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp b/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp
index d5ba1757ce35c..3c7ee7ad1334e 100644
--- a/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp
+++ b/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp
@@ -214,18 +214,18 @@ TEST(UnrollAnalyzerTest, PtrCmpSimplifications) {
       "target datalayout = \"e-m:o-i64:64-f80:128-n8:16:32:64-S128\"\n"
       "define void @ptr_cmp(i8 *%a) {\n"
       "entry:\n"
-      "  %limit = getelementptr i8, i8* %a, i64 40\n"
-      "  %start.iv2 = getelementptr i8, i8* %a, i64 7\n"
+      "  %limit = getelementptr i8, ptr %a, i64 40\n"
+      "  %start.iv2 = getelementptr i8, ptr %a, i64 7\n"
       "  br label %loop.body\n"
       "loop.body:\n"
-      "  %iv.0 = phi i8* [ %a, %entry ], [ %iv.1, %loop.body ]\n"
-      "  %iv2.0 = phi i8* [ %start.iv2, %entry ], [ %iv2.1, %loop.body ]\n"
-      "  %cmp = icmp eq i8* %iv2.0, %iv.0\n"
-      "  %cmp2 = icmp slt i8* %iv2.0, %iv.0\n"
-      "  %cmp3 = icmp ult i8* %iv2.0, %iv.0\n"
-      "  %iv.1 = getelementptr inbounds i8, i8* %iv.0, i64 1\n"
-      "  %iv2.1 = getelementptr inbounds i8, i8* %iv2.0, i64 1\n"
-      "  %exitcond = icmp ne i8* %iv.1, %limit\n"
+      "  %iv.0 = phi ptr [ %a, %entry ], [ %iv.1, %loop.body ]\n"
+      "  %iv2.0 = phi ptr [ %start.iv2, %entry ], [ %iv2.1, %loop.body ]\n"
+      "  %cmp = icmp eq ptr %iv2.0, %iv.0\n"
+      "  %cmp2 = icmp slt ptr %iv2.0, %iv.0\n"
+      "  %cmp3 = icmp ult ptr %iv2.0, %iv.0\n"
+      "  %iv.1 = getelementptr inbounds i8, ptr %iv.0, i64 1\n"
+      "  %iv2.1 = getelementptr inbounds i8, ptr %iv2.0, i64 1\n"
+      "  %exitcond = icmp ne ptr %iv.1, %limit\n"
       "  br i1 %exitcond, label %loop.body, label %loop.exit\n"
       "loop.exit:\n"
       "  ret void\n"
@@ -248,14 +248,14 @@ TEST(UnrollAnalyzerTest, PtrCmpSimplifications) {
   Instruction *Cmp2 = &*BBI++;
   Instruction *Cmp3 = &*BBI++;
   // Check simplification expected on the 5th iteration.
-  // Check that "%cmp = icmp eq i8* %iv2.0, %iv.0" is simplified to 0.
+  // Check that "%cmp = icmp eq ptr %iv2.0, %iv.0" is simplified to 0.
   auto I1 = SimplifiedValuesVector[5].find(Cmp1);
   EXPECT_TRUE(I1 != SimplifiedValuesVector[5].end());
   EXPECT_EQ(cast<ConstantInt>((*I1).second)->getZExtValue(), 0U);
-  // Check that "%cmp2 = icmp slt i8* %iv2.0, %iv.0" does not simplify
+  // Check that "%cmp2 = icmp slt ptr %iv2.0, %iv.0" does not simplify
   auto I2 = SimplifiedValuesVector[5].find(Cmp2);
   EXPECT_TRUE(I2 == SimplifiedValuesVector[5].end());
-  // Check that "%cmp3 = icmp ult i8* %iv2.0, %iv.0" is simplified to 0.
+  // Check that "%cmp3 = icmp ult ptr %iv2.0, %iv.0" is simplified to 0.
   auto I3 = SimplifiedValuesVector[5].find(Cmp3);
   EXPECT_TRUE(I3 != SimplifiedValuesVector[5].end());
   EXPECT_EQ(cast<ConstantInt>((*I1).second)->getZExtValue(), 0U);
@@ -271,8 +271,8 @@ TEST(UnrollAnalyzerTest, CastSimplifications) {
       "\n"
       "loop:\n"
       "  %iv = phi i64 [ 0, %entry ], [ %inc, %loop ]\n"
-      "  %array_const_idx = getelementptr inbounds [10 x i32], [10 x i32]* @known_constant, i64 0, i64 %iv\n"
-      "  %const_array_element = load i32, i32* %array_const_idx, align 4\n"
+      "  %array_const_idx = getelementptr inbounds [10 x i32], ptr @known_constant, i64 0, i64 %iv\n"
+      "  %const_array_element = load i32, ptr %array_const_idx, align 4\n"
       "  %se = sext i32 %const_array_element to i64\n"
       "  %ze = zext i32 %const_array_element to i64\n"
       "  %tr = trunc i32 %const_array_element to i8\n"
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index bb0280ee69cfd..89a0faefc5988 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -2697,9 +2697,9 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsGEPWithRange) {
   parseAssembly(
       "define void @test(ptr %p) {\n"
       "  %A = load i64, ptr %p, !range !{i64 64, i64 65536}\n"
-      "  %APtr = inttoptr i64 %A to float*"
-      "  %APtrPlus512 = getelementptr float, float* %APtr, i32 128\n"
-      "  %c = icmp ugt float* %APtrPlus512, inttoptr (i32 523 to float*)\n"
+      "  %APtr = inttoptr i64 %A to ptr"
+      "  %APtrPlus512 = getelementptr float, ptr %APtr, i32 128\n"
+      "  %c = icmp ugt ptr %APtrPlus512, inttoptr (i32 523 to ptr)\n"
       "  call void @llvm.assume(i1 %c)\n"
       "  ret void\n"
       "}\n"
@@ -2730,9 +2730,9 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsGEPWithRangeNoOverlap) {
   parseAssembly(
       "define void @test(ptr %p) {\n"
       "  %A = load i64, ptr %p, !range !{i64 32, i64 64}\n"
-      "  %APtr = inttoptr i64 %A to float*"
-      "  %APtrPlus512 = getelementptr float, float* %APtr, i32 128\n"
-      "  %c = icmp ugt float* %APtrPlus512, inttoptr (i32 523 to float*)\n"
+      "  %APtr = inttoptr i64 %A to ptr"
+      "  %APtrPlus512 = getelementptr float, ptr %APtr, i32 128\n"
+      "  %c = icmp ugt ptr %APtrPlus512, inttoptr (i32 523 to ptr)\n"
       "  call void @llvm.assume(i1 %c)\n"
       "  ret void\n"
       "}\n"
diff --git a/llvm/unittests/MIR/MachineMetadata.cpp b/llvm/unittests/MIR/MachineMetadata.cpp
index 587551246c4f4..8c3637704fc71 100644
--- a/llvm/unittests/MIR/MachineMetadata.cpp
+++ b/llvm/unittests/MIR/MachineMetadata.cpp
@@ -205,8 +205,8 @@ TEST_F(MachineMetadataTest, MMSlotTrackerAArch64) {
 
   StringRef MIRString = R"MIR(
 --- |
-  define i32 @test0(i32* %p) {
-    %r = load i32, i32* %p, align 4
+  define i32 @test0(ptr %p) {
+    %r = load i32, ptr %p, align 4
     ret i32 %r
   }
 ...
@@ -354,8 +354,8 @@ TEST_F(MachineMetadataTest, MMSlotTrackerX64) {
 
   StringRef MIRString = R"MIR(
 --- |
-  define i32 @test0(i32* %p) {
-    %r = load i32, i32* %p, align 4
+  define i32 @test0(ptr %p) {
+    %r = load i32, ptr %p, align 4
     ret i32 %r
   }
 ...
@@ -446,8 +446,8 @@ TEST_F(MachineMetadataTest, MMSlotTrackerAMDGPU) {
 
   StringRef MIRString = R"MIR(
 --- |
-  define i32 @test0(i32* %p) {
-    %r = load i32, i32* %p, align 4
+  define i32 @test0(ptr %p) {
+    %r = load i32, ptr %p, align 4
     ret i32 %r
   }
 ...
diff --git a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
index d7f2908bb079b..c36ed93a20cef 100644
--- a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
+++ b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
@@ -75,7 +75,7 @@ TEST(WebAssemblyExceptionInfoTest, TEST0) {
 
   declare i32 @__gxx_wasm_personality_v0(...)
 
-  define void @test0() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+  define void @test0() personality ptr @__gxx_wasm_personality_v0 {
     unreachable
   }
 
@@ -237,7 +237,7 @@ TEST(WebAssemblyExceptionInfoTest, TEST1) {
 
   declare i32 @__gxx_wasm_personality_v0(...)
 
-  define void @test1() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+  define void @test1() personality ptr @__gxx_wasm_personality_v0 {
     unreachable
   }
 
diff --git a/llvm/unittests/Transforms/IPO/AttributorTest.cpp b/llvm/unittests/Transforms/IPO/AttributorTest.cpp
index e442dae9aa3ad..e345c60f781d2 100644
--- a/llvm/unittests/Transforms/IPO/AttributorTest.cpp
+++ b/llvm/unittests/Transforms/IPO/AttributorTest.cpp
@@ -78,17 +78,17 @@ TEST_F(AttributorTestBase, AAReachabilityTest) {
   const char *ModuleString = R"(
     @x = external global i32
     define void @func4() {
-      store i32 0, i32* @x
+      store i32 0, ptr @x
       ret void
     }
 
     define internal void @func3() {
-      store i32 0, i32* @x
+      store i32 0, ptr @x
       ret void
     }
 
     define internal void @func8() {
-      store i32 0, i32* @x
+      store i32 0, ptr @x
       ret void
     }
 
@@ -105,7 +105,7 @@ TEST_F(AttributorTestBase, AAReachabilityTest) {
     }
 
     declare void @unknown()
-    define internal void @func5(void ()* %ptr) {
+    define internal void @func5(ptr %ptr) {
     entry:
       call void %ptr()
       call void @unknown()
@@ -114,8 +114,8 @@ TEST_F(AttributorTestBase, AAReachabilityTest) {
 
     define void @func6() {
     entry:
-      store i32 0, i32* @x
-      call void @func5(void ()* @func3)
+      store i32 0, ptr @x
+      call void @func5(ptr @func3)
       ret void
     }
 
diff --git a/llvm/unittests/Transforms/Scalar/LICMTest.cpp b/llvm/unittests/Transforms/Scalar/LICMTest.cpp
index 98a69bbb47de1..a193993ba04d6 100644
--- a/llvm/unittests/Transforms/Scalar/LICMTest.cpp
+++ b/llvm/unittests/Transforms/Scalar/LICMTest.cpp
@@ -37,13 +37,13 @@ TEST(LICMTest, TestSCEVInvalidationOnHoisting) {
 
   SMDiagnostic Error;
   StringRef Text = R"(
-    define void @foo(i64* %ptr) {
+    define void @foo(ptr %ptr) {
     entry:
       br label %loop
 
     loop:
       %iv = phi i64 [ 0, %entry ], [ %iv.inc, %loop ]
-      %n = load i64, i64* %ptr, !invariant.load !0
+      %n = load i64, ptr %ptr, !invariant.load !0
       %iv.inc = add i64 %iv, 1
       %cmp = icmp ult i64 %iv.inc, %n
       br i1 %cmp, label %loop, label %exit
@@ -62,17 +62,17 @@ TEST(LICMTest, TestSCEVInvalidationOnHoisting) {
   BasicBlock &EntryBB = F->getEntryBlock();
   BasicBlock *LoopBB = EntryBB.getUniqueSuccessor();
 
-  // Select `load i64, i64* %ptr`.
+  // Select `load i64, ptr %ptr`.
   Instruction *IBefore = &*LoopBB->getFirstNonPHIIt();
   // Make sure the right instruction was selected.
   ASSERT_TRUE(isa<LoadInst>(IBefore));
-  // Upon this query SCEV caches disposition of <load i64, i64* %ptr> SCEV.
+  // Upon this query SCEV caches disposition of <load i64, ptr %ptr> SCEV.
   ASSERT_EQ(SE.getBlockDisposition(SE.getSCEV(IBefore), LoopBB),
             ScalarEvolution::BlockDisposition::DominatesBlock);
 
   MPM.run(*M, MAM);
 
-  // Select `load i64, i64* %ptr` after it was hoisted.
+  // Select `load i64, ptr %ptr` after it was hoisted.
   Instruction *IAfter = &*EntryBB.getFirstNonPHIIt();
   // Make sure the right instruction was selected.
   ASSERT_TRUE(isa<LoadInst>(IAfter));
@@ -84,7 +84,7 @@ TEST(LICMTest, TestSCEVInvalidationOnHoisting) {
       SE.getBlockDisposition(SE.getSCEV(IAfter), LoopBB);
 
   // If LICM have properly invalidated SCEV,
-  //   1. SCEV of <load i64, i64* %ptr> should properly dominate the "loop" BB,
+  //   1. SCEV of <load i64, ptr %ptr> should properly dominate the "loop" BB,
   //   2. extra invalidation shouldn't change result of the query.
   EXPECT_EQ(DispositionBeforeInvalidation,
             ScalarEvolution::BlockDisposition::ProperlyDominatesBlock);
diff --git a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
index cb3d1001e4110..88eaa875a803a 100644
--- a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
+++ b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
@@ -265,21 +265,21 @@ class LoopPassManagerTest : public ::testing::Test {
 public:
   LoopPassManagerTest()
       : M(parseIR(Context,
-                  "define void @f(i1* %ptr) {\n"
+                  "define void @f(ptr %ptr) {\n"
                   "entry:\n"
                   "  br label %loop.0\n"
                   "loop.0:\n"
-                  "  %cond.0 = load volatile i1, i1* %ptr\n"
+                  "  %cond.0 = load volatile i1, ptr %ptr\n"
                   "  br i1 %cond.0, label %loop.0.0.ph, label %end\n"
                   "loop.0.0.ph:\n"
                   "  br label %loop.0.0\n"
                   "loop.0.0:\n"
-                  "  %cond.0.0 = load volatile i1, i1* %ptr\n"
+                  "  %cond.0.0 = load volatile i1, ptr %ptr\n"
                   "  br i1 %cond.0.0, label %loop.0.0, label %loop.0.1.ph\n"
                   "loop.0.1.ph:\n"
                   "  br label %loop.0.1\n"
                   "loop.0.1:\n"
-                  "  %cond.0.1 = load volatile i1, i1* %ptr\n"
+                  "  %cond.0.1 = load volatile i1, ptr %ptr\n"
                   "  br i1 %cond.0.1, label %loop.0.1, label %loop.0.latch\n"
                   "loop.0.latch:\n"
                   "  br label %loop.0\n"
@@ -287,11 +287,11 @@ class LoopPassManagerTest : public ::testing::Test {
                   "  ret void\n"
                   "}\n"
                   "\n"
-                  "define void @g(i1* %ptr) {\n"
+                  "define void @g(ptr %ptr) {\n"
                   "entry:\n"
                   "  br label %loop.g.0\n"
                   "loop.g.0:\n"
-                  "  %cond.0 = load volatile i1, i1* %ptr\n"
+                  "  %cond.0 = load volatile i1, ptr %ptr\n"
                   "  br i1 %cond.0, label %loop.g.0, label %end\n"
                   "end:\n"
                   "  ret void\n"
@@ -861,26 +861,26 @@ TEST_F(LoopPassManagerTest, IndirectOuterPassInvalidation) {
 
 TEST_F(LoopPassManagerTest, LoopChildInsertion) {
   // Super boring module with three loops in a single loop nest.
-  M = parseIR(Context, "define void @f(i1* %ptr) {\n"
+  M = parseIR(Context, "define void @f(ptr %ptr) {\n"
                        "entry:\n"
                        "  br label %loop.0\n"
                        "loop.0:\n"
-                       "  %cond.0 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0, label %loop.0.0.ph, label %end\n"
                        "loop.0.0.ph:\n"
                        "  br label %loop.0.0\n"
                        "loop.0.0:\n"
-                       "  %cond.0.0 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0.0 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0.0, label %loop.0.0, label %loop.0.1.ph\n"
                        "loop.0.1.ph:\n"
                        "  br label %loop.0.1\n"
                        "loop.0.1:\n"
-                       "  %cond.0.1 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0.1 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0.1, label %loop.0.1, label %loop.0.2.ph\n"
                        "loop.0.2.ph:\n"
                        "  br label %loop.0.2\n"
                        "loop.0.2:\n"
-                       "  %cond.0.2 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0.2 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0.2, label %loop.0.2, label %loop.0.latch\n"
                        "loop.0.latch:\n"
                        "  br label %loop.0\n"
@@ -1064,28 +1064,28 @@ TEST_F(LoopPassManagerTest, LoopChildInsertion) {
 TEST_F(LoopPassManagerTest, LoopPeerInsertion) {
   // Super boring module with two loop nests and loop nest with two child
   // loops.
-  M = parseIR(Context, "define void @f(i1* %ptr) {\n"
+  M = parseIR(Context, "define void @f(ptr %ptr) {\n"
                        "entry:\n"
                        "  br label %loop.0\n"
                        "loop.0:\n"
-                       "  %cond.0 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0, label %loop.0.0.ph, label %loop.2.ph\n"
                        "loop.0.0.ph:\n"
                        "  br label %loop.0.0\n"
                        "loop.0.0:\n"
-                       "  %cond.0.0 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0.0 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0.0, label %loop.0.0, label %loop.0.2.ph\n"
                        "loop.0.2.ph:\n"
                        "  br label %loop.0.2\n"
                        "loop.0.2:\n"
-                       "  %cond.0.2 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0.2 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0.2, label %loop.0.2, label %loop.0.latch\n"
                        "loop.0.latch:\n"
                        "  br label %loop.0\n"
                        "loop.2.ph:\n"
                        "  br label %loop.2\n"
                        "loop.2:\n"
-                       "  %cond.2 = load volatile i1, i1* %ptr\n"
+                       "  %cond.2 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.2, label %loop.2, label %end\n"
                        "end:\n"
                        "  ret void\n"
@@ -1318,31 +1318,31 @@ TEST_F(LoopPassManagerTest, LoopDeletion) {
   // Build a module with a single loop nest that contains one outer loop with
   // three subloops, and one of those with its own subloop. We will
   // incrementally delete all of these to test different deletion scenarios.
-  M = parseIR(Context, "define void @f(i1* %ptr) {\n"
+  M = parseIR(Context, "define void @f(ptr %ptr) {\n"
                        "entry:\n"
                        "  br label %loop.0\n"
                        "loop.0:\n"
-                       "  %cond.0 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0, label %loop.0.0.ph, label %end\n"
                        "loop.0.0.ph:\n"
                        "  br label %loop.0.0\n"
                        "loop.0.0:\n"
-                       "  %cond.0.0 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0.0 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0.0, label %loop.0.0, label %loop.0.1.ph\n"
                        "loop.0.1.ph:\n"
                        "  br label %loop.0.1\n"
                        "loop.0.1:\n"
-                       "  %cond.0.1 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0.1 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0.1, label %loop.0.1, label %loop.0.2.ph\n"
                        "loop.0.2.ph:\n"
                        "  br label %loop.0.2\n"
                        "loop.0.2:\n"
-                       "  %cond.0.2 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0.2 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0.2, label %loop.0.2.0.ph, label %loop.0.latch\n"
                        "loop.0.2.0.ph:\n"
                        "  br label %loop.0.2.0\n"
                        "loop.0.2.0:\n"
-                       "  %cond.0.2.0 = load volatile i1, i1* %ptr\n"
+                       "  %cond.0.2.0 = load volatile i1, ptr %ptr\n"
                        "  br i1 %cond.0.2.0, label %loop.0.2.0, label %loop.0.2.latch\n"
                        "loop.0.2.latch:\n"
                        "  br label %loop.0.2\n"
diff --git a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
index 4235c93f275f0..00d9e9ff81e05 100644
--- a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
@@ -484,9 +484,9 @@ while.body:
 TEST(BasicBlockUtils, SplitIndirectBrCriticalEdgesIgnorePHIs) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"IR(
-define void @crit_edge(i8* %tgt, i1 %cond0, i1 %cond1) {
+define void @crit_edge(ptr %tgt, i1 %cond0, i1 %cond1) {
 entry:
-  indirectbr i8* %tgt, [label %bb0, label %bb1, label %bb2]
+  indirectbr ptr %tgt, [label %bb0, label %bb1, label %bb2]
 bb0:
   br i1 %cond0, label %bb1, label %bb2
 bb1:
@@ -526,9 +526,9 @@ define void @crit_edge(i8* %tgt, i1 %cond0, i1 %cond1) {
 TEST(BasicBlockUtils, SplitIndirectBrCriticalEdges) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"IR(
-define void @crit_edge(i8* %tgt, i1 %cond0, i1 %cond1) {
+define void @crit_edge(ptr %tgt, i1 %cond0, i1 %cond1) {
 entry:
-  indirectbr i8* %tgt, [label %bb0, label %bb1, label %bb2]
+  indirectbr ptr %tgt, [label %bb0, label %bb1, label %bb2]
 bb0:
   br i1 %cond0, label %bb1, label %bb2
 bb1:
diff --git a/llvm/unittests/Transforms/Utils/CloningTest.cpp b/llvm/unittests/Transforms/Utils/CloningTest.cpp
index d990808d31fe2..237bc6e873f94 100644
--- a/llvm/unittests/Transforms/Utils/CloningTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CloningTest.cpp
@@ -394,7 +394,7 @@ TEST(CloneLoop, CloneLoopNest) {
 
   std::unique_ptr<Module> M = parseIR(
     Context,
-    R"(define void @foo(i32* %A, i32 %ub) {
+    R"(define void @foo(ptr %A, i32 %ub) {
 entry:
   %guardcmp = icmp slt i32 0, %ub
   br i1 %guardcmp, label %for.outer.preheader, label %for.end
@@ -408,8 +408,8 @@ for.inner.preheader:
 for.inner:
   %i = phi i32 [ 0, %for.inner.preheader ], [ %inc, %for.inner ]
   %idxprom = sext i32 %i to i64
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
-  store i32 %i, i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %idxprom
+  store i32 %i, ptr %arrayidx, align 4
   %inc = add nsw i32 %i, 1
   %cmp = icmp slt i32 %inc, %ub
   br i1 %cmp, label %for.inner, label %for.inner.exit
@@ -728,10 +728,10 @@ TEST(CloneFunction, CloneEmptyFunction) {
 
 TEST(CloneFunction, CloneFunctionWithInalloca) {
   StringRef ImplAssembly = R"(
-    declare void @a(i32* inalloca(i32))
+    declare void @a(ptr inalloca(i32))
     define void @foo() {
       %a = alloca inalloca i32
-      call void @a(i32* inalloca(i32) %a)
+      call void @a(ptr inalloca(i32) %a)
       ret void
     }
     declare void @bar()
diff --git a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp
index 9ea8de3da1e5b..90f06204ec9b3 100644
--- a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp
@@ -154,13 +154,13 @@ TEST(CodeExtractor, ExitBlockOrderingPhis) {
       %0 = alloca i32, align 4
       br label %test0
     test0:
-      %c = load i32, i32* %0, align 4
+      %c = load i32, ptr %0, align 4
       br label %test1
     test1:
-      %e = load i32, i32* %0, align 4
+      %e = load i32, ptr %0, align 4
       br i1 true, label %first, label %test
     test:
-      %d = load i32, i32* %0, align 4
+      %d = load i32, ptr %0, align 4
       br i1 true, label %first, label %next
     first:
       %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
@@ -212,13 +212,13 @@ TEST(CodeExtractor, ExitBlockOrdering) {
       %0 = alloca i32, align 4
       br label %test0
     test0:
-      %c = load i32, i32* %0, align 4
+      %c = load i32, ptr %0, align 4
       br label %test1
     test1:
-      %e = load i32, i32* %0, align 4
+      %e = load i32, ptr %0, align 4
       br i1 true, label %first, label %test
     test:
-      %d = load i32, i32* %0, align 4
+      %d = load i32, ptr %0, align 4
       br i1 true, label %first, label %next
     first:
       ret void
@@ -317,7 +317,7 @@ TEST(CodeExtractor, StoreOutputInvokeResultAfterEHPad) {
   std::unique_ptr<Module> M(parseAssemblyString(R"invalid(
     declare i8 @hoge()
 
-    define i32 @foo() personality i8* null {
+    define i32 @foo() personality ptr null {
       entry:
         %call = invoke i8 @hoge()
                 to label %invoke.cont unwind label %lpad
@@ -326,8 +326,8 @@ TEST(CodeExtractor, StoreOutputInvokeResultAfterEHPad) {
         unreachable
 
       lpad:                                             ; preds = %entry
-        %0 = landingpad { i8*, i32 }
-                catch i8* null
+        %0 = landingpad { ptr, i32 }
+                catch ptr null
         br i1 undef, label %catch, label %finally.catchall
 
       catch:                                            ; preds = %lpad
@@ -342,13 +342,13 @@ TEST(CodeExtractor, StoreOutputInvokeResultAfterEHPad) {
         unreachable
 
       lpad2:                                           ; preds = %invoke.cont2, %catch
-        %ex.1 = phi i8* [ undef, %invoke.cont2 ], [ null, %catch ]
-        %1 = landingpad { i8*, i32 }
-                catch i8* null
+        %ex.1 = phi ptr [ undef, %invoke.cont2 ], [ null, %catch ]
+        %1 = landingpad { ptr, i32 }
+                catch ptr null
         br label %finally.catchall
 
       finally.catchall:                                 ; preds = %lpad33, %lpad
-        %ex.2 = phi i8* [ %ex.1, %lpad2 ], [ null, %lpad ]
+        %ex.2 = phi ptr [ %ex.1, %lpad2 ], [ null, %lpad ]
         unreachable
     }
   )invalid", Err, Ctx));
@@ -384,7 +384,7 @@ TEST(CodeExtractor, StoreOutputInvokeResultInExitStub) {
   std::unique_ptr<Module> M(parseAssemblyString(R"invalid(
     declare i32 @bar()
 
-    define i32 @foo() personality i8* null {
+    define i32 @foo() personality ptr null {
     entry:
       %0 = invoke i32 @bar() to label %exit unwind label %lpad
 
@@ -392,9 +392,9 @@ TEST(CodeExtractor, StoreOutputInvokeResultInExitStub) {
       ret i32 %0
 
     lpad:
-      %1 = landingpad { i8*, i32 }
+      %1 = landingpad { ptr, i32 }
               cleanup
-      resume { i8*, i32 } %1
+      resume { ptr, i32 } %1
     }
   )invalid",
                                                 Err, Ctx));
@@ -421,7 +421,7 @@ TEST(CodeExtractor, ExtractAndInvalidateAssumptionCache) {
         target triple = "aarch64"
 
         %b = type { i64 }
-        declare void @g(i8*)
+        declare void @g(ptr)
 
         declare void @llvm.assume(i1) #0
 
@@ -430,9 +430,9 @@ TEST(CodeExtractor, ExtractAndInvalidateAssumptionCache) {
           br label %label
 
         label:
-          %0 = load %b*, %b** inttoptr (i64 8 to %b**), align 8
-          %1 = getelementptr inbounds %b, %b* %0, i64 undef, i32 0
-          %2 = load i64, i64* %1, align 8
+          %0 = load ptr, ptr inttoptr (i64 8 to ptr), align 8
+          %1 = getelementptr inbounds %b, ptr %0, i64 undef, i32 0
+          %2 = load i64, ptr %1, align 8
           %3 = icmp ugt i64 %2, 1
           br i1 %3, label %if.then, label %if.else
 
@@ -440,8 +440,8 @@ TEST(CodeExtractor, ExtractAndInvalidateAssumptionCache) {
           unreachable
 
         if.else:
-          call void @g(i8* undef)
-          store i64 undef, i64* null, align 536870912
+          call void @g(ptr undef)
+          store i64 undef, ptr null, align 536870912
           %4 = icmp eq i64 %2, 0
           call void @llvm.assume(i1 %4)
           unreachable
@@ -473,9 +473,9 @@ TEST(CodeExtractor, RemoveBitcastUsesFromOuterLifetimeMarkers) {
     target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
     target triple = "x86_64-unknown-linux-gnu"
 
-    declare void @use(i32*)
-    declare void @llvm.lifetime.start.p0i8(i64, i8*)
-    declare void @llvm.lifetime.end.p0i8(i64, i8*)
+    declare void @use(ptr)
+    declare void @llvm.lifetime.start.p0i8(i64, ptr)
+    declare void @llvm.lifetime.end.p0i8(i64, ptr)
 
     define void @foo() {
     entry:
@@ -483,14 +483,14 @@ TEST(CodeExtractor, RemoveBitcastUsesFromOuterLifetimeMarkers) {
       br label %extract
 
     extract:
-      %1 = bitcast i32* %0 to i8*
-      call void @llvm.lifetime.start.p0i8(i64 4, i8* %1)
-      call void @use(i32* %0)
+      %1 = bitcast ptr %0 to ptr
+      call void @llvm.lifetime.start.p0i8(i64 4, ptr %1)
+      call void @use(ptr %0)
       br label %exit
 
     exit:
-      call void @use(i32* %0)
-      call void @llvm.lifetime.end.p0i8(i64 4, i8* %1)
+      call void @use(ptr %0)
+      call void @llvm.lifetime.end.p0i8(i64 4, ptr %1)
       ret void
     }
   )ir",
diff --git a/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp b/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp
index 9466977d00649..191ccc3a9dbd9 100644
--- a/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp
@@ -75,21 +75,21 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentSimpleTest) {
   //     i = 3;
   // }
   std::unique_ptr<Module> M =
-      parseIR(C, R"(define void @foo(i32* %i, i1 %cond1, i1 %cond2) {
+      parseIR(C, R"(define void @foo(ptr %i, i1 %cond1, i1 %cond2) {
                  entry:
                    br i1 %cond1, label %if.first, label %if.first.end
                  if.first:
-                   store i32 1, i32* %i, align 4
+                   store i32 1, ptr %i, align 4
                    br label %if.first.end
                  if.first.end:
                    br i1 %cond1, label %if.second, label %if.second.end
                  if.second:
-                   store i32 2, i32* %i, align 4
+                   store i32 2, ptr %i, align 4
                    br label %if.second.end
                  if.second.end:
                    br i1 %cond2, label %if.third, label %if.third.end
                  if.third:
-                   store i32 3, i32* %i, align 4
+                   store i32 3, ptr %i, align 4
                    br label %if.third.end
                  if.third.end:
                    ret void
@@ -136,51 +136,51 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentOppositeCondTest) {
   //     i = 9;
   // }
   std::unique_ptr<Module> M =
-      parseIR(C, R"(define void @foo(i32* %i, i32 %X, i32 %Y) {
+      parseIR(C, R"(define void @foo(ptr %i, i32 %X, i32 %Y) {
                  entry:
                    %cmp1 = icmp ult i32 %X, %Y
                    br i1 %cmp1, label %if.first, label %if.first.end
                  if.first:
-                   store i32 1, i32* %i, align 4
+                   store i32 1, ptr %i, align 4
                    br label %if.first.end
                  if.first.end:
                    %cmp2 = icmp ugt i32 %Y, %X
                    br i1 %cmp2, label %if.second, label %if.second.end
                  if.second:
-                   store i32 2, i32* %i, align 4
+                   store i32 2, ptr %i, align 4
                    br label %if.second.end
                  if.second.end:
                    %cmp3 = icmp uge i32 %X, %Y
                    br i1 %cmp3, label %if.third, label %if.third.else
                  if.third:
-                   store i32 3, i32* %i, align 4
+                   store i32 3, ptr %i, align 4
                    br label %if.third.end
                  if.third.else:
-                   store i32 4, i32* %i, align 4
+                   store i32 4, ptr %i, align 4
                    br label %if.third.end
                  if.third.end:
                    %cmp4 = icmp eq i32 %X, %Y
                    br i1 %cmp4, label %if.fourth, label %if.fourth.end
                  if.fourth:
-                   store i32 5, i32* %i, align 4
+                   store i32 5, ptr %i, align 4
                    br label %if.fourth.end
                  if.fourth.end:
                    %cmp5 = icmp eq i32 %Y, %X
                    br i1 %cmp5, label %if.fifth, label %if.fifth.else
                  if.fifth:
-                   store i32 6, i32* %i, align 4
+                   store i32 6, ptr %i, align 4
                    br label %if.fifth.end
                  if.fifth.else:
-                   store i32 7, i32* %i, align 4
+                   store i32 7, ptr %i, align 4
                    br label %if.fifth.end
                  if.fifth.end:
                    %cmp6 = icmp ne i32 %X, %Y
                    br i1 %cmp6, label %if.sixth, label %if.sixth.else
                  if.sixth:
-                   store i32 8, i32* %i, align 4
+                   store i32 8, ptr %i, align 4
                    br label %if.sixth.end
                  if.sixth.else:
-                   store i32 9, i32* %i, align 4
+                   store i32 9, ptr %i, align 4
                    br label %if.sixth.end
                  if.sixth.end:
                    ret void
@@ -227,20 +227,20 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentCondNestTest) {
   //       i = 2;
   // }
   std::unique_ptr<Module> M =
-      parseIR(C, R"(define void @foo(i32* %i, i1 %cond1, i1 %cond2) {
+      parseIR(C, R"(define void @foo(ptr %i, i1 %cond1, i1 %cond2) {
          entry:
            br i1 %cond1, label %if.outer.first, label %if.first.end
          if.outer.first:
            br i1 %cond2, label %if.inner.first, label %if.first.end
          if.inner.first:
-           store i32 1, i32* %i, align 4
+           store i32 1, ptr %i, align 4
            br label %if.first.end
          if.first.end:
            br i1 %cond2, label %if.outer.second, label %if.second.end
          if.outer.second:
            br i1 %cond1, label %if.inner.second, label %if.second.end
          if.inner.second:
-           store i32 2, i32* %i, align 4
+           store i32 2, ptr %i, align 4
            br label %if.second.end
          if.second.end:
            ret void
@@ -283,7 +283,7 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentImbalanceTest) {
   //     i = 4;
   // }
   std::unique_ptr<Module> M =
-      parseIR(C, R"(define void @foo(i32* %i, i1 %cond1, i1 %cond2, i1 %cond3) {
+      parseIR(C, R"(define void @foo(ptr %i, i1 %cond1, i1 %cond2, i1 %cond3) {
          entry:
            br i1 %cond1, label %if.outer.first, label %if.first.end
          if.outer.first:
@@ -291,26 +291,26 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentImbalanceTest) {
          if.middle.first:
            br i1 %cond3, label %if.inner.first, label %if.first.end
          if.inner.first:
-           store i32 1, i32* %i, align 4
+           store i32 1, ptr %i, align 4
            br label %if.first.end
          if.first.end:
            br i1 %cond2, label %if.outer.second, label %if.second.end
          if.outer.second:
            br i1 %cond3, label %if.inner.second, label %if.second.end
          if.inner.second:
-           store i32 2, i32* %i, align 4
+           store i32 2, ptr %i, align 4
            br label %if.second.end
          if.second.end:
            br i1 %cond1, label %if.outer.third, label %if.third.end
          if.outer.third:
            br i1 %cond1, label %if.inner.third, label %if.third.end
          if.inner.third:
-           store i32 3, i32* %i, align 4
+           store i32 3, ptr %i, align 4
            br label %if.third.end
          if.third.end:
            br i1 %cond1, label %if.fourth, label %if.fourth.end
          if.fourth:
-           store i32 4, i32* %i, align 4
+           store i32 4, ptr %i, align 4
            br label %if.fourth.end
          if.fourth.end:
            ret void
@@ -343,28 +343,28 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentPointerTest) {
   //     i = 3;
   // }
   std::unique_ptr<Module> M =
-      parseIR(C, R"(define void @foo(i32* %i, i32* %cond) {
+      parseIR(C, R"(define void @foo(ptr %i, ptr %cond) {
                  entry:
-                   %0 = load i32, i32* %cond, align 4
+                   %0 = load i32, ptr %cond, align 4
                    %tobool1 = icmp ne i32 %0, 0
                    br i1 %tobool1, label %if.first, label %if.first.end
                  if.first:
-                   store i32 1, i32* %i, align 4
+                   store i32 1, ptr %i, align 4
                    br label %if.first.end
                  if.first.end:
-                   %1 = load i32, i32* %cond, align 4
+                   %1 = load i32, ptr %cond, align 4
                    %tobool2 = icmp ne i32 %1, 0
                    br i1 %tobool2, label %if.second, label %if.second.end
                  if.second:
-                   store i32 2, i32* %i, align 4
+                   store i32 2, ptr %i, align 4
                    br label %if.second.end
                  if.second.end:
-                   store i32 1, i32* %cond, align 4
-                   %2 = load i32, i32* %cond, align 4
+                   store i32 1, ptr %cond, align 4
+                   %2 = load i32, ptr %cond, align 4
                    %tobool3 = icmp ne i32 %2, 0
                    br i1 %tobool3, label %if.third, label %if.third.end
                  if.third:
-                   store i32 3, i32* %i, align 4
+                   store i32 3, ptr %i, align 4
                    br label %if.third.end
                  if.third.end:
                    ret void
@@ -450,7 +450,7 @@ TEST(CodeMoverUtils, IsSafeToMoveTest1) {
   //   }
   // }
   std::unique_ptr<Module> M = parseIR(
-      C, R"(define void @foo(i32* noalias %A, i32* noalias %B, i32* noalias %C
+      C, R"(define void @foo(ptr noalias %A, ptr noalias %B, ptr noalias %C
                            , i64 %N) {
          entry:
            %X = sdiv i64 1, %N
@@ -461,18 +461,18 @@ TEST(CodeMoverUtils, IsSafeToMoveTest1) {
            br i1 %cmp1, label %for.body, label %for.end
          for.body:
            %i = phi i64 [ 0, %entry ], [ %inc, %for.body ]
-           %arrayidx_A5 = getelementptr inbounds i32, i32* %A, i64 5
-           store i32 5, i32* %arrayidx_A5, align 4
-           %arrayidx_A = getelementptr inbounds i32, i32* %A, i64 %i
-           store i32 0, i32* %arrayidx_A, align 4
-           %load1 = load i32, i32* %arrayidx_A, align 4
-           %arrayidx_B = getelementptr inbounds i32, i32* %B, i64 %i
-           store i32 %load1, i32* %arrayidx_B, align 4
-           %load2 = load i32, i32* %arrayidx_A, align 4
-           %arrayidx_C = getelementptr inbounds i32, i32* %C, i64 %i
-           store i32 %load2, i32* %arrayidx_C, align 4
-           %arrayidx_A6 = getelementptr inbounds i32, i32* %A, i64 6
-           store i32 6, i32* %arrayidx_A6, align 4
+           %arrayidx_A5 = getelementptr inbounds i32, ptr %A, i64 5
+           store i32 5, ptr %arrayidx_A5, align 4
+           %arrayidx_A = getelementptr inbounds i32, ptr %A, i64 %i
+           store i32 0, ptr %arrayidx_A, align 4
+           %load1 = load i32, ptr %arrayidx_A, align 4
+           %arrayidx_B = getelementptr inbounds i32, ptr %B, i64 %i
+           store i32 %load1, ptr %arrayidx_B, align 4
+           %load2 = load i32, ptr %arrayidx_A, align 4
+           %arrayidx_C = getelementptr inbounds i32, ptr %C, i64 %i
+           store i32 %load2, ptr %arrayidx_C, align 4
+           %arrayidx_A6 = getelementptr inbounds i32, ptr %A, i64 6
+           store i32 6, ptr %arrayidx_A6, align 4
            %inc = add nsw i64 %i, 1
            %cmp = icmp slt i64 %inc, %N
            br i1 %cmp, label %for.body, label %for.end
@@ -686,19 +686,19 @@ TEST(CodeMoverUtils, IsSafeToMoveTest5) {
   LLVMContext C;
 
   std::unique_ptr<Module> M =
-      parseIR(C, R"(define void @dependence(i32* noalias %A, i32* noalias %B){
+      parseIR(C, R"(define void @dependence(ptr noalias %A, ptr noalias %B){
 entry:
-    store i32 0, i32* %A, align 4 ; storeA0
-    store i32 2, i32* %A, align 4 ; storeA1
-    %tmp0 = load i32, i32* %A, align 4 ; loadA0
-    store i32 1, i32* %B, align 4 ; storeB0
-    %tmp1 = load i32, i32* %A, align 4 ; loadA1
-    store i32 2, i32* %A, align 4 ; storeA2 
-    store i32 4, i32* %B, align 4 ; StoreB1 
-    %tmp2 = load i32, i32* %A, align 4 ; loadA2 
-    %tmp3 = load i32, i32* %A, align 4 ; loadA3 
-    %tmp4 = load i32, i32* %B, align 4 ; loadB2
-    %tmp5 = load i32, i32* %B, align 4 ; loadB3
+    store i32 0, ptr %A, align 4 ; storeA0
+    store i32 2, ptr %A, align 4 ; storeA1
+    %tmp0 = load i32, ptr %A, align 4 ; loadA0
+    store i32 1, ptr %B, align 4 ; storeB0
+    %tmp1 = load i32, ptr %A, align 4 ; loadA1
+    store i32 2, ptr %A, align 4 ; storeA2
+    store i32 4, ptr %B, align 4 ; StoreB1
+    %tmp2 = load i32, ptr %A, align 4 ; loadA2
+    %tmp3 = load i32, ptr %A, align 4 ; loadA3
+    %tmp4 = load i32, ptr %B, align 4 ; loadB2
+    %tmp5 = load i32, ptr %B, align 4 ; loadB3
     ret void
 })");
 
@@ -763,63 +763,63 @@ TEST(CodeMoverUtils, IsSafeToMoveTest6) {
   LLVMContext C;
 
   std::unique_ptr<Module> M = parseIR(
-      C, R"(define void @dependence(i1 %cond, i32* noalias %A, i32* noalias %B){
+      C, R"(define void @dependence(i1 %cond, ptr noalias %A, ptr noalias %B){
    entry:
         br i1 %cond, label %bb0, label %bb1
    bb0:
         br label %bb1
     bb1:
-        store i32 0, i32* %A, align 4 ; storeA0
+        store i32 0, ptr %A, align 4 ; storeA0
         br i1 %cond, label %bb2, label %bb3
     bb2:
         br label %bb3
     bb3:
-        store i32 2, i32* %A, align 4 ; storeA1
+        store i32 2, ptr %A, align 4 ; storeA1
         br i1 %cond, label %bb4, label %bb5
     bb4:
         br label %bb5
     bb5:
-        %tmp0 = load i32, i32* %A, align 4 ; loadA0
+        %tmp0 = load i32, ptr %A, align 4 ; loadA0
         br i1 %cond, label %bb6, label %bb7
     bb6:
         br label %bb7
     bb7:
-        store i32 1, i32* %B, align 4 ; storeB0
+        store i32 1, ptr %B, align 4 ; storeB0
         br i1 %cond, label %bb8, label %bb9
     bb8:
         br label %bb9
     bb9:
-        %tmp1 = load i32, i32* %A, align 4 ; loadA1
+        %tmp1 = load i32, ptr %A, align 4 ; loadA1
         br i1 %cond, label %bb10, label %bb11
     bb10:
         br label %bb11
     bb11:
-        store i32 2, i32* %A, align 4 ; storeA2
+        store i32 2, ptr %A, align 4 ; storeA2
         br i1 %cond, label %bb12, label %bb13
     bb12:
         br label %bb13
     bb13:
-        store i32 4, i32* %B, align 4 ; StoreB1
+        store i32 4, ptr %B, align 4 ; StoreB1
         br i1 %cond, label %bb14, label %bb15
     bb14:
         br label %bb15
     bb15:
-        %tmp2 = load i32, i32* %A, align 4 ; loadA2
+        %tmp2 = load i32, ptr %A, align 4 ; loadA2
         br i1 %cond, label %bb16, label %bb17
     bb16:
         br label %bb17
     bb17:
-        %tmp3 = load i32, i32* %A, align 4 ; loadA3
+        %tmp3 = load i32, ptr %A, align 4 ; loadA3
         br i1 %cond, label %bb18, label %bb19
     bb18:
         br label %bb19
     bb19:
-        %tmp4 = load i32, i32* %B, align 4 ; loadB2
+        %tmp4 = load i32, ptr %B, align 4 ; loadB2
         br i1 %cond, label %bb20, label %bb21
     bb20:
        br label %bb21
     bb21:
-       %tmp5 = load i32, i32* %B, align 4 ; loadB3
+       %tmp5 = load i32, ptr %B, align 4 ; loadB3
        ret void
    })");
   run(*M, "dependence",
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index 4908eda16e002..c37ed5d8613b0 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -183,7 +183,7 @@ TEST(Local, MergeBasicBlockIntoOnlyPred) {
   auto resetIR = [&]() {
     M = parseIR(C,
                 R"(
-      define i32 @f(i8* %str) {
+      define i32 @f(ptr %str) {
       entry:
         br label %bb2.i
       bb2.i:                                            ; preds = %bb4.i, %entry
@@ -411,7 +411,7 @@ TEST(Local, ConstantFoldTerminator) {
 
       define void @indirectbr() {
       entry:
-        indirectbr i8* blockaddress(@indirectbr, %bb0), [label %bb0, label %bb1]
+        indirectbr ptr blockaddress(@indirectbr, %bb0), [label %bb0, label %bb1]
       bb0:
         ret void
       bb1:
@@ -420,14 +420,14 @@ TEST(Local, ConstantFoldTerminator) {
 
       define void @indirectbr_repeated() {
       entry:
-        indirectbr i8* blockaddress(@indirectbr_repeated, %bb0), [label %bb0, label %bb0]
+        indirectbr ptr blockaddress(@indirectbr_repeated, %bb0), [label %bb0, label %bb0]
       bb0:
         ret void
       }
 
       define void @indirectbr_unreachable() {
       entry:
-        indirectbr i8* blockaddress(@indirectbr_unreachable, %bb0), [label %bb1]
+        indirectbr ptr blockaddress(@indirectbr_unreachable, %bb0), [label %bb1]
       bb0:
         ret void
       bb1:
@@ -925,7 +925,7 @@ TEST(Local, RemoveUnreachableBlocks) {
 
       declare i32 @__gxx_personality_v0(...)
 
-      define void @invoke_terminator() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+      define void @invoke_terminator() personality ptr @__gxx_personality_v0 {
       entry:
         br i1 undef, label %invoke.block, label %exit
 
@@ -943,8 +943,8 @@ TEST(Local, RemoveUnreachableBlocks) {
         unreachable
 
       lpad.block:
-        %lp = landingpad { i8*, i32 }
-                catch i8* null
+        %lp = landingpad { ptr, i32 }
+                catch ptr null
         br label %exit
 
       exit:
diff --git a/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp b/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp
index b97bc311f4655..dd03b4f2ae971 100644
--- a/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp
+++ b/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp
@@ -98,13 +98,13 @@ struct MemTransferLowerTest : public testing::Test {
 // For that reason expandMemCpyAsLoop is expected to  explicitly mark
 // loads from source and stores to destination as not aliasing.
 TEST_F(MemTransferLowerTest, MemCpyKnownLength) {
-  ParseAssembly("declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8 *, i64, i1)\n"
-                "define void @foo(i8* %dst, i8* %src, i64 %n) optsize {\n"
+  ParseAssembly("declare void @llvm.memcpy.p0i8.p0i8.i64(ptr, ptr, i64, i1)\n"
+                "define void @foo(ptr %dst, ptr %src, i64 %n) optsize {\n"
                 "entry:\n"
-                "  %is_not_equal = icmp ne i8* %dst, %src\n"
+                "  %is_not_equal = icmp ne ptr %dst, %src\n"
                 "  br i1 %is_not_equal, label %memcpy, label %exit\n"
                 "memcpy:\n"
-                "  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, "
+                "  call void @llvm.memcpy.p0i8.p0i8.i64(ptr %dst, ptr %src, "
                 "i64 1024, i1 false)\n"
                 "  br label %exit\n"
                 "exit:\n"
@@ -138,13 +138,13 @@ TEST_F(MemTransferLowerTest, MemCpyKnownLength) {
 // llvm.memcpy lowering) doesn't alias by making sure the loop can be
 // successfully vectorized without additional runtime checks.
 TEST_F(MemTransferLowerTest, VecMemCpyKnownLength) {
-  ParseAssembly("declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8 *, i64, i1)\n"
-                "define void @foo(i8* %dst, i8* %src, i64 %n) optsize {\n"
+  ParseAssembly("declare void @llvm.memcpy.p0i8.p0i8.i64(ptr, ptr, i64, i1)\n"
+                "define void @foo(ptr %dst, ptr %src, i64 %n) optsize {\n"
                 "entry:\n"
-                "  %is_not_equal = icmp ne i8* %dst, %src\n"
+                "  %is_not_equal = icmp ne ptr %dst, %src\n"
                 "  br i1 %is_not_equal, label %memcpy, label %exit\n"
                 "memcpy:\n"
-                "  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, "
+                "  call void @llvm.memcpy.p0i8.p0i8.i64(ptr %dst, ptr %src, "
                 "i64 1024, i1 false)\n"
                 "  br label %exit\n"
                 "exit:\n"
@@ -176,16 +176,16 @@ TEST_F(MemTransferLowerTest, VecMemCpyKnownLength) {
 
 TEST_F(MemTransferLowerTest, AtomicMemCpyKnownLength) {
   ParseAssembly("declare void "
-                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32*, "
+                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(ptr, "
                 "i32 *, i64, i32)\n"
-                "define void @foo(i32* %dst, i32* %src, i64 %n) optsize {\n"
+                "define void @foo(ptr %dst, ptr %src, i64 %n) optsize {\n"
                 "entry:\n"
-                "  %is_not_equal = icmp ne i32* %dst, %src\n"
+                "  %is_not_equal = icmp ne ptr %dst, %src\n"
                 "  br i1 %is_not_equal, label %memcpy, label %exit\n"
                 "memcpy:\n"
                 "  call void "
-                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32* "
-                "%dst, i32* %src, "
+                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(ptr "
+                "%dst, ptr %src, "
                 "i64 1024, i32 4)\n"
                 "  br label %exit\n"
                 "exit:\n"
@@ -221,16 +221,16 @@ TEST_F(MemTransferLowerTest, AtomicMemCpyKnownLength) {
 
 TEST_F(MemTransferLowerTest, AtomicMemCpyUnKnownLength) {
   ParseAssembly("declare void "
-                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32*, "
+                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(ptr, "
                 "i32 *, i64, i32)\n"
-                "define void @foo(i32* %dst, i32* %src, i64 %n) optsize {\n"
+                "define void @foo(ptr %dst, ptr %src, i64 %n) optsize {\n"
                 "entry:\n"
-                "  %is_not_equal = icmp ne i32* %dst, %src\n"
+                "  %is_not_equal = icmp ne ptr %dst, %src\n"
                 "  br i1 %is_not_equal, label %memcpy, label %exit\n"
                 "memcpy:\n"
                 "  call void "
-                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32* "
-                "%dst, i32* %src, "
+                "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(ptr "
+                "%dst, ptr %src, "
                 "i64 %n, i32 4)\n"
                 "  br label %exit\n"
                 "exit:\n"
diff --git a/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp b/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp
index 55eae64fe0f6d..4fe30805f9499 100644
--- a/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp
+++ b/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp
@@ -121,18 +121,18 @@ TEST_F(ScalarEvolutionExpanderTest, ExpandPtrTypeSCEV) {
 TEST_F(ScalarEvolutionExpanderTest, SCEVZeroExtendExprNonIntegral) {
   /*
    * Create the following code:
-   * func(i64 addrspace(10)* %arg)
+   * func(ptr addrspace(10) %arg)
    * top:
    *  br label %L.ph
    * L.ph:
-   *  %gepbase = getelementptr i64 addrspace(10)* %arg, i64 1
+   *  %gepbase = getelementptr ptr addrspace(10) %arg, i64 1
    *  br label %L
    * L:
    *  %phi = phi i64 [i64 0, %L.ph], [ %add, %L2 ]
    *  %add = add i64 %phi2, 1
    *  br i1 undef, label %post, label %L2
    * post:
-   *  #= %gep = getelementptr i64 addrspace(10)* %gepbase, i64 %add =#
+   *  #= %gep = getelementptr ptr addrspace(10) %gepbase, i64 %add =#
    *  ret void
    *
    * We will create the appropriate SCEV expression for %gep and expand it,
@@ -199,7 +199,7 @@ TEST_F(ScalarEvolutionExpanderTest, SCEVZeroExtendExprNonIntegral) {
 TEST_F(ScalarEvolutionExpanderTest, SCEVExpanderIsSafeToExpandAt) {
   /*
    * Create the following code:
-   * func(i64 addrspace(10)* %arg)
+   * func(ptr addrspace(10) %arg)
    * top:
    *  br label %L.ph
    * L.ph:
@@ -704,14 +704,14 @@ TEST_F(ScalarEvolutionExpanderTest, SCEVExpanderShlNSW) {
     EXPECT_FALSE(I->hasNoSignedWrap());
   };
 
-  checkOneCase("define void @f(i16* %arrayidx) { "
-               "  %1 = load i16, i16* %arrayidx "
+  checkOneCase("define void @f(ptr %arrayidx) { "
+               "  %1 = load i16, ptr %arrayidx "
                "  %2 = and i16 %1, -32768 "
                "  ret void "
                "} ");
 
-  checkOneCase("define void @f(i8* %arrayidx) { "
-               "  %1 = load i8, i8* %arrayidx "
+  checkOneCase("define void @f(ptr %arrayidx) { "
+               "  %1 = load i8, ptr %arrayidx "
                "  %2 = and i8 %1, -128 "
                "  ret void "
                "} ");
diff --git a/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp b/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
index eec10110e6af4..7ba259deb574c 100644
--- a/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
+++ b/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp
@@ -34,7 +34,7 @@ TEST(LoopUnrollRuntime, Latch) {
 
   std::unique_ptr<Module> M = parseIR(
     C,
-    R"(define i32 @test(i32* %a, i32* %b, i32* %c, i64 %n) {
+    R"(define i32 @test(ptr %a, ptr %b, ptr %c, i64 %n) {
 entry:
   br label %while.cond
 
@@ -44,13 +44,13 @@ while.cond:                                       ; preds = %while.body, %entry
   br i1 %cmp, label %while.body, label %while.end
 
 while.body:                                       ; preds = %while.cond
-  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.0
-  %0 = load i32, i32* %arrayidx
-  %arrayidx1 = getelementptr inbounds i32, i32* %c, i64 %i.0
-  %1 = load i32, i32* %arrayidx1
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %i.0
+  %0 = load i32, ptr %arrayidx
+  %arrayidx1 = getelementptr inbounds i32, ptr %c, i64 %i.0
+  %1 = load i32, ptr %arrayidx1
   %mul = mul nsw i32 %0, %1
-  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %i.0
-  store i32 %mul, i32* %arrayidx2
+  %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %i.0
+  store i32 %mul, ptr %arrayidx2
   %inc = add nsw i64 %i.0, 1
   br label %while.cond
 
diff --git a/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp b/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp
index e39cd7038b280..7f12deae2ad1b 100644
--- a/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp
+++ b/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp
@@ -74,7 +74,7 @@ TEST(ValueMapperTest, mapMDNodeDuplicatedCycle) {
 
   // Create a cycle that references G0.
   MDNode *N0; // !0 = !{!1}
-  MDNode *N1; // !1 = !{!0, i8* @G0}
+  MDNode *N1; // !1 = !{!0, ptr @G0}
   {
     auto T0 = MDTuple::getTemporary(Context, nullptr);
     Metadata *Ops1[] = {T0.get(), ConstantAsMetadata::get(G0.get())};

From 19d472f06efe4c744ef499d8bf3db740a2b9e2fb Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 18:52:45 -0800
Subject: [PATCH 55/97] AArch64: Convert tests to opaque pointers (#167442)

---
 .../aarch64-reassociate-accumulators.ll       |  56 +++++-----
 .../CodeGen/AArch64/cgdata-merge-local.ll     |  12 +--
 .../CodeGen/AArch64/cgdata-merge-no-params.ll |  12 +--
 .../AArch64/cgdata-no-merge-unnamed.ll        |  12 +--
 llvm/test/CodeGen/AArch64/divrem.ll           |   6 +-
 .../AArch64/ldp-stp-scaled-unscaled-pairs.ll  |   2 +-
 .../AArch64/machine-outliner-iterative.mir    |   8 +-
 .../CodeGen/AArch64/misched-fusion-cmp-bcc.ll |   4 +-
 llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll  |   2 +-
 llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll |  28 ++---
 .../CodeGen/AArch64/ptrauth-call-rv-marker.ll | 102 +++++++++---------
 llvm/test/CodeGen/AArch64/ptrauth-reloc.ll    |   6 +-
 12 files changed, 125 insertions(+), 125 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll
index a84d666c1be6b..d1bcad4724e48 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll
@@ -24,8 +24,8 @@ loop:
   %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
-  %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1
-  %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1
+  %a = load <8 x i8>, ptr %ptr1_i, align 1
+  %b = load <8 x i8>, ptr %ptr2_i, align 1
   %vabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
   %vabd_ext = zext <8 x i8> %vabd to <8 x i16>
   %acc_next = add <8 x i16> %vabd_ext, %acc_phi
@@ -65,8 +65,8 @@ loop:
   %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i
-  %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1
-  %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1
+  %a = load <4 x i16>, ptr %ptr1_i, align 1
+  %b = load <4 x i16>, ptr %ptr2_i, align 1
   %vabd = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
   %vmov = zext <4 x i16> %vabd to <4 x i32>
   %acc_next = add <4 x i32> %vmov, %acc_phi
@@ -116,8 +116,8 @@ loop:
   %acc_phi_lo = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next_lo, %loop ]
   %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
-  %a = load <16 x i8>, <16 x i8>* %ptr1_i, align 1
-  %b = load <16 x i8>, <16 x i8>* %ptr2_i, align 1
+  %a = load <16 x i8>, ptr %ptr1_i, align 1
+  %b = load <16 x i8>, ptr %ptr2_i, align 1
   %a_hi = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %b_hi = shufflevector <16 x i8> %b, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %a_lo = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -160,8 +160,8 @@ loop:
   %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i
-  %a = load <4 x i32>, <4 x i32>* %ptr1_i, align 1
-  %b = load <4 x i32>, <4 x i32>* %ptr2_i, align 1
+  %a = load <4 x i32>, ptr %ptr1_i, align 1
+  %b = load <4 x i32>, ptr %ptr2_i, align 1
   %vabd = tail call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %a, <4 x i32> %b)
   %acc_next = add <4 x i32> %acc_phi, %vabd
   %next_i = add i32 %i, 4
@@ -198,8 +198,8 @@ loop:
   ; Load values from ptr1 and ptr2
   %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i
-  %a = load <4 x i32>, <4 x i32>* %ptr1_i, align 1
-  %b = load <4 x i32>, <4 x i32>* %ptr2_i, align 1
+  %a = load <4 x i32>, ptr %ptr1_i, align 1
+  %b = load <4 x i32>, ptr %ptr2_i, align 1
   ; Perform the intrinsic operation
   %vabd = tail call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %a, <4 x i32> %b)
   %acc_next = add <4 x i32> %acc_phi, %vabd
@@ -237,8 +237,8 @@ loop:
   %acc_phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i
-  %a = load <2 x i32>, <2 x i32>* %ptr1_i, align 1
-  %b = load <2 x i32>, <2 x i32>* %ptr2_i, align 1
+  %a = load <2 x i32>, ptr %ptr1_i, align 1
+  %b = load <2 x i32>, ptr %ptr2_i, align 1
   %vabd = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
   %acc_next = add <2 x i32> %acc_phi, %vabd
   %next_i = add i32 %i, 2
@@ -272,8 +272,8 @@ loop:
   %acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
-  %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1
-  %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1
+  %a = load <8 x i8>, ptr %ptr1_i, align 1
+  %b = load <8 x i8>, ptr %ptr2_i, align 1
   %vabd = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
   %acc_next = add <8 x i8> %acc_phi, %vabd
   %next_i = add i32 %i, 8
@@ -307,8 +307,8 @@ loop:
   %acc_phi = phi <16 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
-  %a = load <16 x i8>, <16 x i8>* %ptr1_i, align 1
-  %b = load <16 x i8>, <16 x i8>* %ptr2_i, align 1
+  %a = load <16 x i8>, ptr %ptr1_i, align 1
+  %b = load <16 x i8>, ptr %ptr2_i, align 1
   %vabd = tail call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %a, <16 x i8> %b)
   %acc_next = add <16 x i8> %acc_phi, %vabd
   %next_i = add i32 %i, 16
@@ -342,8 +342,8 @@ loop:
   %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i
-  %a = load <8 x i16>, <8 x i16>* %ptr1_i, align 1
-  %b = load <8 x i16>, <8 x i16>* %ptr2_i, align 1
+  %a = load <8 x i16>, ptr %ptr1_i, align 1
+  %b = load <8 x i16>, ptr %ptr2_i, align 1
   %vabd = tail call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %a, <8 x i16> %b)
   %acc_next = add <8 x i16> %acc_phi, %vabd
   %next_i = add i32 %i, 8
@@ -377,8 +377,8 @@ loop:
   %acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
-  %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1
-  %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1
+  %a = load <8 x i8>, ptr %ptr1_i, align 1
+  %b = load <8 x i8>, ptr %ptr2_i, align 1
   %vabd = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
   %acc_next = add <8 x i8> %acc_phi, %vabd
   %next_i = add i32 %i, 8
@@ -411,8 +411,8 @@ loop:
   %acc_phi = phi <4 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i
-  %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1
-  %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1
+  %a = load <4 x i16>, ptr %ptr1_i, align 1
+  %b = load <4 x i16>, ptr %ptr2_i, align 1
   %vabd = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
   %acc_next = add <4 x i16> %acc_phi, %vabd
   %next_i = add i32 %i, 4
@@ -445,8 +445,8 @@ loop:
   %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i
-  %a = load <8 x i16>, <8 x i16>* %ptr1_i, align 1
-  %b = load <8 x i16>, <8 x i16>* %ptr2_i, align 1
+  %a = load <8 x i16>, ptr %ptr1_i, align 1
+  %b = load <8 x i16>, ptr %ptr2_i, align 1
   %vabd = tail call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %a, <8 x i16> %b)
   %acc_next = add <8 x i16> %acc_phi, %vabd
   %next_i = add i32 %i, 8
@@ -480,8 +480,8 @@ loop:
   %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i
-  %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1
-  %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1
+  %a = load <8 x i8>, ptr %ptr1_i, align 1
+  %b = load <8 x i8>, ptr %ptr2_i, align 1
   %vabd = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
   %vmov = zext <8 x i8> %vabd to <8 x i16>
   %acc_next = add <8 x i16> %vmov, %acc_phi
@@ -516,8 +516,8 @@ loop:
   %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ]
   %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i
   %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i
-  %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1
-  %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1
+  %a = load <4 x i16>, ptr %ptr1_i, align 1
+  %b = load <4 x i16>, ptr %ptr2_i, align 1
   %vabd = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
   %vmov = zext <4 x i16> %vabd to <4 x i32>
   %acc_next = add <4 x i32> %vmov, %acc_phi
diff --git a/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll b/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll
index 608fe29e17398..d421b3f17caf8 100644
--- a/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll
+++ b/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll
@@ -54,9 +54,9 @@
 define i32 @f1(i32 %a) {
 entry:
   %idxprom = sext i32 %a to i64
-  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
-  %0 = load i32, i32* %arrayidx, align 4
-  %1 = load volatile i32, i32* @g1, align 4
+  %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = load volatile i32, ptr @g1, align 4
   %mul = mul nsw i32 %1, %0
   %add = add nsw i32 %mul, 1
   ret i32 %add
@@ -65,9 +65,9 @@ entry:
 define i32 @f2(i32 %a) {
 entry:
   %idxprom = sext i32 %a to i64
-  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
-  %0 = load i32, i32* %arrayidx, align 4
-  %1 = load volatile i32, i32* @g2, align 4
+  %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = load volatile i32, ptr @g2, align 4
   %mul = mul nsw i32 %1, %0
   %add = add nsw i32 %mul, 1
   ret i32 %add
diff --git a/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll b/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll
index 10f0e10f11d66..a9da1253de01d 100644
--- a/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll
+++ b/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll
@@ -19,9 +19,9 @@
 define i32 @f1(i32 %a) {
 entry:
   %idxprom = sext i32 %a to i64
-  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
-  %0 = load i32, i32* %arrayidx, align 4
-  %1 = load volatile i32, i32* @g1, align 4
+  %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = load volatile i32, ptr @g1, align 4
   %mul = mul nsw i32 %1, %0
   %add = add nsw i32 %mul, 1
   ret i32 %add
@@ -30,9 +30,9 @@ entry:
 define i32 @f2(i32 %a) {
 entry:
   %idxprom = sext i32 %a to i64
-  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
-  %0 = load i32, i32* %arrayidx, align 4
-  %1 = load volatile i32, i32* @g1, align 4
+  %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = load volatile i32, ptr @g1, align 4
   %mul = mul nsw i32 %1, %0
   %add = add nsw i32 %mul, 1
   ret i32 %add
diff --git a/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll
index 9986af7eb231c..7ab2aba8d75e2 100644
--- a/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll
+++ b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll
@@ -12,9 +12,9 @@
 define i32 @0(i32 %a) {
 entry:
   %idxprom = sext i32 %a to i64
-  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
-  %0 = load i32, i32* %arrayidx, align 4
-  %1 = load volatile i32, i32* @g1, align 4
+  %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = load volatile i32, ptr @g1, align 4
   %mul = mul nsw i32 %1, %0
   %add = add nsw i32 %mul, 1
   ret i32 %add
@@ -23,9 +23,9 @@ entry:
 define i32 @1(i32 %a) {
 entry:
   %idxprom = sext i32 %a to i64
-  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
-  %0 = load i32, i32* %arrayidx, align 4
-  %1 = load volatile i32, i32* @g2, align 4
+  %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = load volatile i32, ptr @g2, align 4
   %mul = mul nsw i32 %1, %0
   %add = add nsw i32 %mul, 1
   ret i32 %add
diff --git a/llvm/test/CodeGen/AArch64/divrem.ll b/llvm/test/CodeGen/AArch64/divrem.ll
index 5cd7e098d00bb..e3cbd17dc4c3f 100644
--- a/llvm/test/CodeGen/AArch64/divrem.ll
+++ b/llvm/test/CodeGen/AArch64/divrem.ll
@@ -2,7 +2,7 @@
 
 ; SDIVREM/UDIVREM DAG nodes are generated but expanded when lowering and
 ; should not generate select error.
-define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) {
+define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, ptr %z) {
 ; CHECK-LABEL: test_udivrem
 ; CHECK-DAG: udivrem
 ; CHECK-NOT: LLVM ERROR: Cannot select
@@ -12,10 +12,10 @@ define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) {
   ret <2 x i32> %1
 }
 
-define <4 x i32> @test_sdivrem(<4 x i32> %x,  ptr %y) {
+define <4 x i32> @test_sdivrem(<4 x i32> %x, ptr %y) {
 ; CHECK-LABEL: test_sdivrem
 ; CHECK-DAG: sdivrem
-  %div = sdiv <4 x i32> %x,  < i32 20, i32 20, i32 20, i32 20 >
+  %div = sdiv <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 >
   store <4 x i32> %div, ptr %y
   %1 = srem <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 >
   ret <4 x i32> %1
diff --git a/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll b/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll
index 91cf605613b9e..c0c8894ce1f6b 100644
--- a/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll
@@ -85,7 +85,7 @@ define i64 @test_ldrsw_ldursw(ptr %p) #0 {
 ; CHECK-NEXT: add.2d v0, v[[V0]], v[[V1]]
 ; CHECK-NEXT: ret
 define <2 x i64> @test_ldrq_ldruq_invalidoffset(ptr %p) #0 {
-  %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8
+  %tmp1 = load <2 x i64>, ptr %p, align 8
   %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 3
   %tmp2 = load <2 x i64>, ptr %add.ptr2, align 8
   %add = add nsw <2 x i64> %tmp1, %tmp2
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir b/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir
index b7fbdc09c1dd1..a635231fef7fb 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir
@@ -6,9 +6,9 @@
 #
 #; define void @"$s12"(...) {         define i64 @"$s5” (...) {             define void @"$s13"(...) {
 #   ...                                ...                                   ...
-#   %8 = load i1, i1* %7                                                     %8 = load i1, i1* %7
-#   %9 = load i4, i4*, %6              %9 = load i4, i4*, %6                 %9 = load i4, i4*, %6
-#   store i4 %9, i4* %5                store i4 %9, i4* %5                   store i4 %9, i4* %5
+#   %8 = load i1, ptr %7                                                     %8 = load i1, ptr %7
+#   %9 = load i4, ptr, %6              %9 = load i4, ptr, %6                 %9 = load i4, ptr, %6
+#   store i4 %9, ptr %5                store i4 %9, ptr %5                   store i4 %9, ptr %5
 #   ...                                ...                                   ...
 # }                                  }                                     }
 #
@@ -16,7 +16,7 @@
 #
 # define void @"$s12"(...) {         define i64 @"$s5” (...) {             define void @"$s13"(...) {
 #   ...                                ...                                   ...
-#   %8 = load i1, i1* %7                                                     %8 = load i1, i1* %7
+#   %8 = load i1, ptr %7                                                     %8 = load i1, ptr %7
 #   call void @outlined_function_1_1   call void @outlined_function_1_1      call void @outlined_function_1_1
 #   ...                                ...                                   ...
 # }                                  }                                     }
diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll b/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll
index 700a060ef968f..0a10e80d998cd 100644
--- a/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll
+++ b/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll
@@ -15,10 +15,10 @@
 ; RUN: llc %s -o - -O0 -mtriple=aarch64-unknown -mcpu=ampere1b      | FileCheck %s
 
 
-define void @test_cmp_bcc_fusion(i32 %x, i32 %y, i32* %arr) {
+define void @test_cmp_bcc_fusion(i32 %x, i32 %y, ptr %arr) {
 entry:
   %cmp = icmp eq i32 %x, %y
-  store i32 %x, i32* %arr, align 4
+  store i32 %x, ptr %arr, align 4
   br i1 %cmp, label %if_true, label %if_false
 
 if_true:
diff --git a/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll b/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll
index b7dde881291bb..1a85f803b9e57 100644
--- a/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll
+++ b/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll
@@ -19,7 +19,7 @@ define void @test_nopair_st(ptr %ptr, <2 x double> %v1, <2 x double> %v2) {
 ; SLOW-NOT: ldp
 ; FAST: ldp
 define <2 x i64> @test_nopair_ld(ptr %p) {
-  %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8
+  %tmp1 = load <2 x i64>, ptr %p, align 8
   %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2
   %tmp2 = load <2 x i64>, ptr %add.ptr2, align 8
   %add = add nsw <2 x i64> %tmp1, %tmp2
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll b/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll
index 0356a46ec1050..df5e1a9f1ee10 100644
--- a/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll
+++ b/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll
@@ -17,7 +17,7 @@
 ; CHECK-NEXT:  bti c
 ; CHECK-NEXT:  mov x16, x0
 ; CHECK-NEXT:  braaz x16
-define i32 @test_tailcall_ia_0(i32 ()* %arg0) #0 {
+define i32 @test_tailcall_ia_0(ptr %arg0) #0 {
   %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 0) ]
   ret i32 %tmp0
 }
@@ -26,7 +26,7 @@ define i32 @test_tailcall_ia_0(i32 ()* %arg0) #0 {
 ; CHECK-NEXT:  bti c
 ; CHECK-NEXT:  mov x16, x0
 ; CHECK-NEXT:  brabz x16
-define i32 @test_tailcall_ib_0(i32 ()* %arg0) #0 {
+define i32 @test_tailcall_ib_0(ptr %arg0) #0 {
   %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 0) ]
   ret i32 %tmp0
 }
@@ -36,7 +36,7 @@ define i32 @test_tailcall_ib_0(i32 ()* %arg0) #0 {
 ; CHECK-NEXT:  mov x16, x0
 ; CHECK-NEXT:  mov x17, #42
 ; CHECK-NEXT:  braa x16, x17
-define i32 @test_tailcall_ia_imm(i32 ()* %arg0) #0 {
+define i32 @test_tailcall_ia_imm(ptr %arg0) #0 {
   %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 42) ]
   ret i32 %tmp0
 }
@@ -46,7 +46,7 @@ define i32 @test_tailcall_ia_imm(i32 ()* %arg0) #0 {
 ; CHECK-NEXT:  mov x16, x0
 ; CHECK-NEXT:  mov x17, #42
 ; CHECK-NEXT:  brab x16, x17
-define i32 @test_tailcall_ib_imm(i32 ()* %arg0) #0 {
+define i32 @test_tailcall_ib_imm(ptr %arg0) #0 {
   %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 42) ]
   ret i32 %tmp0
 }
@@ -60,8 +60,8 @@ define i32 @test_tailcall_ib_imm(i32 ()* %arg0) #0 {
 ; ELF-NEXT:    ldr x1, [x1]
 ; ELF-NEXT:    mov x16, x0
 ; ELF-NEXT:    braa x16, x1
-define i32 @test_tailcall_ia_var(i32 ()* %arg0, i64* %arg1) #0 {
-  %tmp0 = load i64, i64* %arg1
+define i32 @test_tailcall_ia_var(ptr %arg0, ptr %arg1) #0 {
+  %tmp0 = load i64, ptr %arg1
   %tmp1 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 %tmp0) ]
   ret i32 %tmp1
 }
@@ -75,8 +75,8 @@ define i32 @test_tailcall_ia_var(i32 ()* %arg0, i64* %arg1) #0 {
 ; ELF-NEXT:    ldr x1, [x1]
 ; ELF-NEXT:    mov x16, x0
 ; ELF-NEXT:    brab x16, x1
-define i32 @test_tailcall_ib_var(i32 ()* %arg0, i64* %arg1) #0 {
-  %tmp0 = load i64, i64* %arg1
+define i32 @test_tailcall_ib_var(ptr %arg0, ptr %arg1) #0 {
+  %tmp0 = load i64, ptr %arg1
   %tmp1 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 %tmp0) ]
   ret i32 %tmp1
 }
@@ -85,7 +85,7 @@ define i32 @test_tailcall_ib_var(i32 ()* %arg0, i64* %arg1) #0 {
 ; CHECK-NEXT:  bti c
 ; CHECK-NEXT:  mov x16, x0
 ; CHECK-NEXT:  braa x16, x1
-define i32 @test_tailcall_ia_arg(i32 ()* %arg0, i64 %arg1) #0 {
+define i32 @test_tailcall_ia_arg(ptr %arg0, i64 %arg1) #0 {
   %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 %arg1) ]
   ret i32 %tmp0
 }
@@ -94,7 +94,7 @@ define i32 @test_tailcall_ia_arg(i32 ()* %arg0, i64 %arg1) #0 {
 ; CHECK-NEXT:  bti c
 ; CHECK-NEXT:  mov x16, x0
 ; CHECK-NEXT:  brab x16, x1
-define i32 @test_tailcall_ib_arg(i32 ()* %arg0, i64 %arg1) #0 {
+define i32 @test_tailcall_ib_arg(ptr %arg0, i64 %arg1) #0 {
   %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 %arg1) ]
   ret i32 %tmp0
 }
@@ -103,8 +103,8 @@ define i32 @test_tailcall_ib_arg(i32 ()* %arg0, i64 %arg1) #0 {
 ; CHECK-NEXT:  bti c
 ; CHECK-NEXT:  ldr x16, [x0]
 ; CHECK-NEXT:  braa x16, x1
-define i32 @test_tailcall_ia_arg_ind(i32 ()** %arg0, i64 %arg1) #0 {
-  %tmp0 = load i32 ()*, i32 ()** %arg0
+define i32 @test_tailcall_ia_arg_ind(ptr %arg0, i64 %arg1) #0 {
+  %tmp0 = load ptr, ptr %arg0
   %tmp1 = tail call i32 %tmp0() [ "ptrauth"(i32 0, i64 %arg1) ]
   ret i32 %tmp1
 }
@@ -113,8 +113,8 @@ define i32 @test_tailcall_ia_arg_ind(i32 ()** %arg0, i64 %arg1) #0 {
 ; CHECK-NEXT:  bti c
 ; CHECK-NEXT:  ldr x16, [x0]
 ; CHECK-NEXT:  brab x16, x1
-define i32 @test_tailcall_ib_arg_ind(i32 ()** %arg0, i64 %arg1) #0 {
-  %tmp0 = load i32 ()*, i32 ()** %arg0
+define i32 @test_tailcall_ib_arg_ind(ptr %arg0, i64 %arg1) #0 {
+  %tmp0 = load ptr, ptr %arg0
   %tmp1 = tail call i32 %tmp0() [ "ptrauth"(i32 1, i64 %arg1) ]
   ret i32 %tmp1
 }
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll b/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll
index 9cf77b125e107..950db5fd6381f 100644
--- a/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll
+++ b/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll
@@ -4,18 +4,18 @@
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "arm64e-apple-iphoneos"
 
-declare i8* @foo0(i32)
-declare i8* @foo1()
+declare ptr @foo0(i32)
+declare ptr @foo1()
 
-declare void @llvm.objc.release(i8*)
-declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
-declare i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8*)
+declare void @llvm.objc.release(ptr)
+declare ptr @llvm.objc.retainAutoreleasedReturnValue(ptr)
+declare ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue(ptr)
 
-declare void @foo2(i8*)
+declare void @foo2(ptr)
 
 declare void @foo(i64, i64, i64)
 
-define void @rv_marker_ptrauth_blraa(i8* ()** %arg0, i64 %arg1) {
+define void @rv_marker_ptrauth_blraa(ptr %arg0, i64 %arg1) {
 ; CHECK-LABEL: rv_marker_ptrauth_blraa
 ; CHECK:         ldr [[ADDR:x[0-9]+]], [
 ; CHECK-NEXT:    blraa [[ADDR]], x1
@@ -23,14 +23,14 @@ define void @rv_marker_ptrauth_blraa(i8* ()** %arg0, i64 %arg1) {
 ; CHECK-NEXT:    bl objc_retainAutoreleasedReturnValue
 ;
 entry:
-  %tmp0 = load i8* ()*, i8* ()** %arg0
-  %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
-  tail call void @foo2(i8* %call0)
-  tail call void @llvm.objc.release(i8* %call0)
+  %tmp0 = load ptr, ptr %arg0
+  %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ]
+  tail call void @foo2(ptr %call0)
+  tail call void @llvm.objc.release(ptr %call0)
   ret void
 }
 
-define void @rv_marker_ptrauth_blraa_unsafeClaim(i8* ()** %arg0, i64 %arg1) {
+define void @rv_marker_ptrauth_blraa_unsafeClaim(ptr %arg0, i64 %arg1) {
 ; CHECK-LABEL: rv_marker_ptrauth_blraa_unsafeClaim
 ; CHECK:         ldr [[ADDR:x[0-9]+]], [
 ; CHECK-NEXT:    blraa [[ADDR]], x1
@@ -38,14 +38,14 @@ define void @rv_marker_ptrauth_blraa_unsafeClaim(i8* ()** %arg0, i64 %arg1) {
 ; CHECK-NEXT:    bl objc_unsafeClaimAutoreleasedReturnValue
 ;
 entry:
-  %tmp0 = load i8* ()*, i8* ()** %arg0
-  %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.unsafeClaimAutoreleasedReturnValue) ]
-  tail call void @foo2(i8* %call0)
-  tail call void @llvm.objc.release(i8* %call0)
+  %tmp0 = load ptr, ptr %arg0
+  %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue) ]
+  tail call void @foo2(ptr %call0)
+  tail call void @llvm.objc.release(ptr %call0)
   ret void
 }
 
-define void @rv_marker_ptrauth_blraa_disc_imm16(i8* ()** %arg0) {
+define void @rv_marker_ptrauth_blraa_disc_imm16(ptr %arg0) {
 ; CHECK-LABEL: rv_marker_ptrauth_blraa_disc_imm16
 ; CHECK:         ldr [[ADDR:x[0-9]+]], [
 ; CHECK-NEXT:    mov x17, #45431
@@ -53,14 +53,14 @@ define void @rv_marker_ptrauth_blraa_disc_imm16(i8* ()** %arg0) {
 ; CHECK-NEXT:    mov x29, x29
 ; CHECK-NEXT:    bl objc_retainAutoreleasedReturnValue
 ;
-  %tmp0 = load i8* ()*, i8* ()** %arg0
-  %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 45431), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
-  tail call void @foo2(i8* %call0)
-  tail call void @llvm.objc.release(i8* %call0)
+  %tmp0 = load ptr, ptr %arg0
+  %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 45431), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ]
+  tail call void @foo2(ptr %call0)
+  tail call void @llvm.objc.release(ptr %call0)
   ret void
 }
 
-define void @rv_marker_ptrauth_blraa_multiarg(i8* (i64, i64, i64)** %arg0, i64 %arg1, i64 %a, i64 %b, i64 %c) {
+define void @rv_marker_ptrauth_blraa_multiarg(ptr %arg0, i64 %arg1, i64 %a, i64 %b, i64 %c) {
 ; CHECK-LABEL: rv_marker_ptrauth_blraa_multiarg
 ; CHECK:         mov  [[TMP:x[0-9]+]], x1
 ; CHECK-DAG:     ldr [[ADDR:x[0-9]+]]
@@ -71,28 +71,28 @@ define void @rv_marker_ptrauth_blraa_multiarg(i8* (i64, i64, i64)** %arg0, i64 %
 ; CHECK-NEXT:   bl objc_retainAutoreleasedReturnValue
 ;
 entry:
-  %tmp0 = load i8* (i64, i64, i64)*, i8* (i64, i64, i64)** %arg0
-  %call0 = call i8* %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
-  tail call void @foo2(i8* %call0)
-  tail call void @llvm.objc.release(i8* %call0)
+  %tmp0 = load ptr, ptr %arg0
+  %call0 = call ptr %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ]
+  tail call void @foo2(ptr %call0)
+  tail call void @llvm.objc.release(ptr %call0)
   ret void
 }
 
-define void @rv_marker_ptrauth_blrab(i8* ()** %arg0, i64 %arg1) {
+define void @rv_marker_ptrauth_blrab(ptr %arg0, i64 %arg1) {
 ; CHECK-LABEL: rv_marker_ptrauth_blrab
 ; CHECK:         ldr [[ADDR:x[0-9]+]], [
 ; CHECK-NEXT:    blrab [[ADDR]], x1
 ; CHECK-NEXT:   mov x29, x29
 ; CHECK-NEXT:   bl objc_retainAutoreleasedReturnValue
 ;
-  %tmp0 = load i8* ()*, i8* ()** %arg0
-  %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
-  tail call void @foo2(i8* %call0)
-  tail call void @llvm.objc.release(i8* %call0)
+  %tmp0 = load ptr, ptr %arg0
+  %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ]
+  tail call void @foo2(ptr %call0)
+  tail call void @llvm.objc.release(ptr %call0)
   ret void
 }
 
-define void @rv_marker_ptrauth_blrab_disc_imm16(i8* ()** %arg0) {
+define void @rv_marker_ptrauth_blrab_disc_imm16(ptr %arg0) {
 ; CHECK-LABEL: rv_marker_ptrauth_blrab_disc_imm16
 ; CHECK:         ldr [[ADDR:x[0-9]+]], [
 ; CHECK-NEXT:    mov x17, #256
@@ -100,42 +100,42 @@ define void @rv_marker_ptrauth_blrab_disc_imm16(i8* ()** %arg0) {
 ; CHECK-NEXT:   mov x29, x29
 ; CHECK-NEXT:   bl objc_retainAutoreleasedReturnValue
 ;
-  %tmp0 = load i8* ()*, i8* ()** %arg0
-  %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 256), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
-  tail call void @foo2(i8* %call0)
-  tail call void @llvm.objc.release(i8* %call0)
+  %tmp0 = load ptr, ptr %arg0
+  %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 256), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ]
+  tail call void @foo2(ptr %call0)
+  tail call void @llvm.objc.release(ptr %call0)
   ret void
 }
 
-define void @rv_marker_ptrauth_blraaz(i8* ()** %arg0) {
+define void @rv_marker_ptrauth_blraaz(ptr %arg0) {
 ; CHECK-LABEL: rv_marker_ptrauth_blraaz
 ; CHECK:         ldr [[ADDR:x[0-9]+]], [
 ; CHECK-NEXT:    blraaz [[ADDR]]
 ; CHECK-NEXT:   mov x29, x29
 ; CHECK-NEXT:   bl objc_retainAutoreleasedReturnValue
 ;
-  %tmp0 = load i8* ()*, i8* ()** %arg0
-  %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
-  tail call void @foo2(i8* %call0)
-  tail call void @llvm.objc.release(i8* %call0)
+  %tmp0 = load ptr, ptr %arg0
+  %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ]
+  tail call void @foo2(ptr %call0)
+  tail call void @llvm.objc.release(ptr %call0)
   ret void
 }
 
-define void @rv_marker_ptrauth_blrabz(i8* ()** %arg0) {
+define void @rv_marker_ptrauth_blrabz(ptr %arg0) {
 ; CHECK-LABEL: rv_marker_ptrauth_blrabz
 ; CHECK:         ldr [[ADDR:x[0-9]+]], [
 ; CHECK-NEXT:    blrabz [[ADDR]]
 ; CHECK-NEXT:   mov x29, x29
 ; CHECK-NEXT:   bl objc_retainAutoreleasedReturnValue
 ;
-  %tmp0 = load i8* ()*, i8* ()** %arg0
-  %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
-  tail call void @foo2(i8* %call0)
-  tail call void @llvm.objc.release(i8* %call0)
+  %tmp0 = load ptr, ptr %arg0
+  %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ]
+  tail call void @foo2(ptr %call0)
+  tail call void @llvm.objc.release(ptr %call0)
   ret void
 }
 
-define void @rv_marker_ptrauth_blrabz_multiarg(i8* (i64, i64, i64)** %arg0, i64 %a, i64 %b, i64 %c) {
+define void @rv_marker_ptrauth_blrabz_multiarg(ptr %arg0, i64 %a, i64 %b, i64 %c) {
 ; CHECK-LABEL: rv_marker_ptrauth_blrabz_multiarg
 ; CHECK:         mov  [[TMP:x[0-9]+]], x1
 ; CHECK-DAG:     ldr [[ADDR:x[0-9]+]], [
@@ -146,9 +146,9 @@ define void @rv_marker_ptrauth_blrabz_multiarg(i8* (i64, i64, i64)** %arg0, i64
 ; CHECK-NEXT:    mov x29, x29
 ; CHECK-NEXT:    bl objc_retainAutoreleasedReturnValue
 ;
-  %tmp0 = load i8* (i64, i64, i64)*, i8* (i64, i64, i64)** %arg0
-  %call0 = call i8* %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ]
-  tail call void @foo2(i8* %call0)
-  tail call void @llvm.objc.release(i8* %call0)
+  %tmp0 = load ptr, ptr %arg0
+  %call0 = call ptr %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ]
+  tail call void @foo2(ptr %call0)
+  tail call void @llvm.objc.release(ptr %call0)
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll
index 932cc946db0ea..02c643f101913 100644
--- a/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll
+++ b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll
@@ -87,7 +87,7 @@
 ; CHECK-MACHO-NEXT:  _g.offset.ref.da.0:
 ; CHECK-MACHO-NEXT:    .quad (_g+16)@AUTH(da,0)
 
-@g.offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2)
+@g.offset.ref.da.0 = constant ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 16), i32 2)
 
 ; CHECK-ELF-LABEL:     .globl g.big_offset.ref.da.0
 ; CHECK-ELF-NEXT:      .p2align 3
@@ -99,7 +99,7 @@
 ; CHECK-MACHO-NEXT:  _g.big_offset.ref.da.0:
 ; CHECK-MACHO-NEXT:    .quad (_g+2147549185)@AUTH(da,0)
 
-@g.big_offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 add (i64 2147483648, i64 65537)), i32 2)
+@g.big_offset.ref.da.0 = constant ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 add (i64 2147483648, i64 65537)), i32 2)
 
 ; CHECK-ELF-LABEL:     .globl g.weird_ref.da.0
 ; CHECK-ELF-NEXT:      .p2align 3
@@ -111,7 +111,7 @@
 ; CHECK-MACHO-NEXT:  _g.weird_ref.da.0:
 ; CHECK-MACHO-NEXT:    .quad (_g+16)@AUTH(da,0)
 
-@g.weird_ref.da.0 = constant i64 ptrtoint (ptr inttoptr (i64 ptrtoint (ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2) to i64) to ptr) to i64)
+@g.weird_ref.da.0 = constant i64 ptrtoint (ptr inttoptr (i64 ptrtoint (ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 16), i32 2) to i64) to ptr) to i64)
 
 ; CHECK-ELF-LABEL:     .globl g_weak.ref.ia.42
 ; CHECK-ELF-NEXT:      .p2align 3

From 3618ed14e4b0461f6a060db04c4a06db402861cd Mon Sep 17 00:00:00 2001
From: guan jian <guanjian@stu.cdut.edu.cn>
Date: Tue, 11 Nov 2025 11:03:34 +0800
Subject: [PATCH 56/97] [DAGCombiner] Add sra-xor-sra pattern fold (#166777)

Add `fold (sra (xor (sra x, c1), -1), c2) -> (sra (xor x, -1), c3)`

The IR like this:
```
  %a = ashr i8 %x, 6
  %n = xor i8 %a, -1
  %s = sext i8 %n to i16
  %r = and i16 %s, %y
  ret i16 %r
```

llvm will produce:
```
	slli	a0, a0, 56
	srai	a0, a0, 56
	not	a0, a0
	srai	a0, a0, 6
	and	a0, a0, a1
	ret
```

56 and 6 can be add up

alive2: https://alive2.llvm.org/ce/z/yxRQf9

---------

Co-authored-by: rez5427 <785369607@qq.com>
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 ++++++++++
 llvm/test/CodeGen/RISCV/sra-xor-sra.ll        | 32 +++++++++++++++++++
 2 files changed, 48 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/sra-xor-sra.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4f2eb1e64dbe0..df353c4d91b1a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10988,6 +10988,22 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
     }
   }
 
+  // fold (sra (xor (sra x, c1), -1), c2) -> (xor (sra x, c3), -1)
+  // This allows merging two arithmetic shifts even when there's a NOT in
+  // between.
+  SDValue X;
+  APInt C1;
+  if (N1C && sd_match(N0, m_OneUse(m_Not(
+                              m_OneUse(m_Sra(m_Value(X), m_ConstInt(C1))))))) {
+    APInt C2 = N1C->getAPIntValue();
+    zeroExtendToMatch(C1, C2, 1 /* Overflow Bit */);
+    APInt Sum = C1 + C2;
+    unsigned ShiftSum = Sum.getLimitedValue(OpSizeInBits - 1);
+    SDValue NewShift = DAG.getNode(
+        ISD::SRA, DL, VT, X, DAG.getShiftAmountConstant(ShiftSum, VT, DL));
+    return DAG.getNOT(DL, NewShift, VT);
+  }
+
   // fold (sra (shl X, m), (sub result_size, n))
   // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
   // result_size - n != m.
diff --git a/llvm/test/CodeGen/RISCV/sra-xor-sra.ll b/llvm/test/CodeGen/RISCV/sra-xor-sra.ll
new file mode 100644
index 0000000000000..b04f0a29d07f3
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/sra-xor-sra.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s
+
+; Test folding of: (sra (xor (sra x, c1), -1), c2) -> (sra (xor x, -1), c3)
+; Original motivating example: should merge sra+sra across xor
+define i16 @not_invert_signbit_splat_mask(i8 %x, i16 %y) {
+; CHECK-LABEL: not_invert_signbit_splat_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a0, a0, 56
+; CHECK-NEXT:    srai a0, a0, 62
+; CHECK-NEXT:    not a0, a0
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    ret
+  %a = ashr i8 %x, 6
+  %n = xor i8 %a, -1
+  %s = sext i8 %n to i16
+  %r = and i16 %s, %y
+  ret i16 %r
+}
+
+; Edge case
+define i16 @sra_xor_sra_overflow(i8 %x, i16 %y) {
+; CHECK-LABEL: sra_xor_sra_overflow:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 0
+; CHECK-NEXT:    ret
+  %a = ashr i8 %x, 10
+  %n = xor i8 %a, -1
+  %s = sext i8 %n to i16
+  %r = and i16 %s, %y
+  ret i16 %r
+}

From ae5036681caf3d613dfbbdf56af0f8a22709e1b9 Mon Sep 17 00:00:00 2001
From: Nick Sarnie <nick.sarnie@intel.com>
Date: Tue, 11 Nov 2025 12:33:17 +0900
Subject: [PATCH 57/97] [OMPIRBuilder] Use AS 0 for internal variables for
 AMDGPU (#167377)

We see some libomptarget test failures if we use the default global AS.

See https://github.com/llvm/llvm-project/pull/166459 for more info.

Signed-off-by: Nick Sarnie <nick.sarnie@intel.com>
---
 clang/test/OpenMP/force-usm.c             |  2 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/clang/test/OpenMP/force-usm.c b/clang/test/OpenMP/force-usm.c
index 45c0e28b525da..5c63a9a5e7004 100644
--- a/clang/test/OpenMP/force-usm.c
+++ b/clang/test/OpenMP/force-usm.c
@@ -46,7 +46,7 @@ int main(void) {
 // CHECK-USM-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK-USM:       user_code.entry:
 // CHECK-USM-NEXT:    store i32 1, ptr [[TMP0]], align 4
-// CHECK-USM-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(1) @pGI_decl_tgt_ref_ptr, align 8
+// CHECK-USM-NEXT:    [[TMP2:%.*]] = load ptr, ptr @pGI_decl_tgt_ref_ptr, align 8
 // CHECK-USM-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
 // CHECK-USM-NEXT:    store i32 2, ptr [[TMP3]], align 4
 // CHECK-USM-NEXT:    call void @__kmpc_target_deinit()
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 18a4f0a08cb5f..ac86fa859967e 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -8482,8 +8482,15 @@ GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(
     // create different versions of the function for different OMP internal
     // variables.
     const DataLayout &DL = M.getDataLayout();
-    unsigned AddressSpaceVal =
-        AddressSpace ? *AddressSpace : DL.getDefaultGlobalsAddressSpace();
+    // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
+    // default global AS is 1.
+    // See double-target-call-with-declare-target.f90 and
+    // declare-target-vars-in-target-region.f90 libomptarget
+    // tests.
+    unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
+                               : M.getTargetTriple().isAMDGPU()
+                                   ? 0
+                                   : DL.getDefaultGlobalsAddressSpace();
     auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
                        ? GlobalValue::InternalLinkage
                        : GlobalValue::CommonLinkage;

From dee0afa0484b356df0133b508578c438bd9ab897 Mon Sep 17 00:00:00 2001
From: Liu Ke <liuke.gehry@bytedance.com>
Date: Tue, 11 Nov 2025 11:46:34 +0800
Subject: [PATCH 58/97] [BOLT][DWARF] Slice .debug_str from the DWP for each CU
 (#159540)

Slice .debug_str from the DWP for each CU using .debug_str_offsets and
emit it, instead of directly copying the global .debug_str, in order to
address the bloat issue of DWO after updates. (more details here -
#155766 )
---
 bolt/include/bolt/Core/DebugData.h            |  12 +
 bolt/lib/Rewrite/DWARFRewriter.cpp            | 120 +++++-
 bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s | 330 ++++++++++++++++
 bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s | 368 ++++++++++++++++++
 .../X86/dwarf4-str-dwp-input-dwo-output.test  |  76 ++++
 .../X86/dwarf5-str-dwp-input-dwo-output.test  |  76 ++++
 6 files changed, 980 insertions(+), 2 deletions(-)
 create mode 100644 bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s
 create mode 100644 bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s
 create mode 100644 bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test
 create mode 100644 bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test

diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h
index 7c8ea12ee3ee3..faf7bb62c6bee 100644
--- a/bolt/include/bolt/Core/DebugData.h
+++ b/bolt/include/bolt/Core/DebugData.h
@@ -471,6 +471,12 @@ class DebugStrOffsetsWriter {
     return std::move(StrOffsetsBuffer);
   }
 
+  /// Returns strings of .debug_str_offsets.
+  StringRef getBufferStr() {
+    return StringRef(reinterpret_cast<const char *>(StrOffsetsBuffer->data()),
+                     StrOffsetsBuffer->size());
+  }
+
   /// Initializes Buffer and Stream.
   void initialize(DWARFUnit &Unit);
 
@@ -507,6 +513,12 @@ class DebugStrWriter {
     return std::move(StrBuffer);
   }
 
+  /// Returns strings of .debug_str.
+  StringRef getBufferStr() {
+    return StringRef(reinterpret_cast<const char *>(StrBuffer->data()),
+                     StrBuffer->size());
+  }
+
   /// Adds string to .debug_str.
   /// On first invocation it initializes internal data structures.
   uint32_t addString(StringRef Str);
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 5e3fa931e826f..816acb229fec5 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -1723,7 +1723,76 @@ StringRef getSectionName(const SectionRef &Section) {
   return Name;
 }
 
-// Extracts an appropriate slice if input is DWP.
+/// Extracts the slice of the .debug_str.dwo section for a given CU from a DWP
+/// file, based on the .debug_str_offsets.dwo section. This helps address DWO
+/// bloat that may occur after updates.
+///
+/// A slice of .debug_str.dwo may be composed of several non-contiguous
+/// fragments. These non-contiguous string views will be written out
+/// sequentially, avoiding the copying overhead caused by assembling them.
+///
+/// The .debug_str_offsets for the first CU often does not need to be updated,
+/// so copying is only performed when .debug_str_offsets requires updating.
+static void UpdateStrAndStrOffsets(StringRef StrDWOContent,
+                                   StringRef StrOffsetsContent,
+                                   SmallVectorImpl<StringRef> &StrDWOOutData,
+                                   std::string &StrOffsetsOutData,
+                                   unsigned DwarfVersion, bool IsLittleEndian) {
+  const llvm::endianness Endian =
+      IsLittleEndian ? llvm::endianness::little : llvm::endianness::big;
+  const uint64_t HeaderOffset = (DwarfVersion >= 5) ? 8 : 0;
+  constexpr size_t SizeOfOffset = sizeof(int32_t);
+  const uint64_t NumOffsets =
+      (StrOffsetsContent.size() - HeaderOffset) / SizeOfOffset;
+
+  DataExtractor Extractor(StrOffsetsContent, IsLittleEndian, 0);
+  uint64_t ExtractionOffset = HeaderOffset;
+
+  using StringFragment = DWARFUnitIndex::Entry::SectionContribution;
+  const auto getStringLength = [](StringRef Content,
+                                  uint64_t Offset) -> uint64_t {
+    size_t NullPos = Content.find('\0', Offset);
+    return (NullPos != StringRef::npos) ? (NullPos - Offset + 1) : 0;
+  };
+  const auto isContiguous = [](const StringFragment &Fragment,
+                               uint64_t NextOffset) -> bool {
+    return NextOffset == Fragment.getOffset() + Fragment.getLength();
+  };
+  std::optional<StringFragment> CurrentFragment;
+  uint64_t AccumulatedStrLen = 0;
+  for (uint64_t I = 0; I < NumOffsets; ++I) {
+    const uint64_t StrOffset = Extractor.getU32(&ExtractionOffset);
+    const uint64_t StringLength = getStringLength(StrDWOContent, StrOffset);
+    if (!CurrentFragment) {
+      // First init.
+      CurrentFragment = StringFragment(StrOffset, StringLength);
+    } else {
+      if (isContiguous(*CurrentFragment, StrOffset)) {
+        // Expanding the current fragment.
+        CurrentFragment->setLength(CurrentFragment->getLength() + StringLength);
+      } else {
+        // Saving the current fragment and start a new one.
+        StrDWOOutData.push_back(StrDWOContent.substr(
+            CurrentFragment->getOffset(), CurrentFragment->getLength()));
+        CurrentFragment = StringFragment(StrOffset, StringLength);
+      }
+    }
+    if (AccumulatedStrLen != StrOffset) {
+      // Updating str offsets.
+      if (StrOffsetsOutData.empty())
+        StrOffsetsOutData = StrOffsetsContent.str();
+      llvm::support::endian::write32(
+          &StrOffsetsOutData[HeaderOffset + I * SizeOfOffset],
+          static_cast<uint32_t>(AccumulatedStrLen), Endian);
+    }
+    AccumulatedStrLen += StringLength;
+  }
+  if (CurrentFragment)
+    StrDWOOutData.push_back(StrDWOContent.substr(CurrentFragment->getOffset(),
+                                                 CurrentFragment->getLength()));
+}
+
+// Exctracts an appropriate slice if input is DWP.
 // Applies patches or overwrites the section.
 std::optional<StringRef> updateDebugData(
     DWARFContext &DWCtx, StringRef SectionName, StringRef SectionContents,
@@ -1772,6 +1841,8 @@ std::optional<StringRef> updateDebugData(
       errs() << "BOLT-WARNING: unsupported debug section: " << SectionName
              << "\n";
     if (StrWriter.isInitialized()) {
+      if (CUDWOEntry)
+        return StrWriter.getBufferStr();
       OutputBuffer = StrWriter.releaseBuffer();
       return StringRef(reinterpret_cast<const char *>(OutputBuffer->data()),
                        OutputBuffer->size());
@@ -1786,6 +1857,8 @@ std::optional<StringRef> updateDebugData(
   }
   case DWARFSectionKind::DW_SECT_STR_OFFSETS: {
     if (StrOffstsWriter.isFinalized()) {
+      if (CUDWOEntry)
+        return StrOffstsWriter.getBufferStr();
       OutputBuffer = StrOffstsWriter.releaseBuffer();
       return StringRef(reinterpret_cast<const char *>(OutputBuffer->data()),
                        OutputBuffer->size());
@@ -1888,6 +1961,10 @@ void DWARFRewriter::writeDWOFiles(
     }
   }
 
+  StringRef StrDWOContent;
+  StringRef StrOffsetsContent;
+  llvm::SmallVector<StringRef, 3> StrDWOOutData;
+  std::string StrOffsetsOutData;
   for (const SectionRef &Section : File->sections()) {
     std::unique_ptr<DebugBufferVector> OutputData;
     StringRef SectionName = getSectionName(Section);
@@ -1895,11 +1972,50 @@ void DWARFRewriter::writeDWOFiles(
       continue;
     Expected<StringRef> ContentsExp = Section.getContents();
     assert(ContentsExp && "Invalid contents.");
+    if (IsDWP && SectionName == "debug_str.dwo") {
+      if (StrWriter.isInitialized())
+        StrDWOContent = StrWriter.getBufferStr();
+      else
+        StrDWOContent = *ContentsExp;
+      continue;
+    }
     if (std::optional<StringRef> OutData = updateDebugData(
             (*DWOCU)->getContext(), SectionName, *ContentsExp, KnownSections,
             *Streamer, *this, CUDWOEntry, DWOId, OutputData, RangeListssWriter,
-            LocWriter, StrOffstsWriter, StrWriter, OverridenSections))
+            LocWriter, StrOffstsWriter, StrWriter, OverridenSections)) {
+      if (IsDWP && SectionName == "debug_str_offsets.dwo") {
+        StrOffsetsContent = *OutData;
+        continue;
+      }
       Streamer->emitBytes(*OutData);
+    }
+  }
+
+  if (IsDWP) {
+    // Handling both .debug_str.dwo and .debug_str_offsets.dwo concurrently. In
+    // the original DWP, .debug_str is a deduplicated global table, and the
+    // .debug_str.dwo slice for a single CU needs to be extracted according to
+    // .debug_str_offsets.dwo.
+    UpdateStrAndStrOffsets(StrDWOContent, StrOffsetsContent, StrDWOOutData,
+                           StrOffsetsOutData, CU.getVersion(),
+                           (*DWOCU)->getContext().isLittleEndian());
+    auto SectionIter = KnownSections.find("debug_str.dwo");
+    if (SectionIter != KnownSections.end()) {
+      Streamer->switchSection(SectionIter->second.first);
+      for (size_t i = 0; i < StrDWOOutData.size(); ++i) {
+        StringRef OutData = StrDWOOutData[i];
+        if (!OutData.empty())
+          Streamer->emitBytes(OutData);
+      }
+    }
+    SectionIter = KnownSections.find("debug_str_offsets.dwo");
+    if (SectionIter != KnownSections.end()) {
+      Streamer->switchSection(SectionIter->second.first);
+      if (!StrOffsetsOutData.empty())
+        Streamer->emitBytes(StrOffsetsOutData);
+      else
+        Streamer->emitBytes(StrOffsetsContent);
+    }
   }
   Streamer->finish();
   TempOut->keep();
diff --git a/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s b/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s
new file mode 100644
index 0000000000000..cc951b689a5c6
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s
@@ -0,0 +1,330 @@
+#--- main.s
+# clang++ -g2 -gdwarf-4 -gsplit-dwarf=split -gno-pubnames -S main.cpp
+# extern int getReturn();
+# int main() {
+#   return getReturn();
+# }
+	.file	"main.cpp"
+	.globl	main                            # -- Begin function main
+	.type	main,@function
+main:                                   # @main
+.Lfunc_begin0:
+	.file	1 "." "main.cpp"
+	.loc	1 2 0                           # main.cpp:2:0
+	.loc	1 3 10 prologue_end             # main.cpp:3:10
+	.loc	1 3 3 epilogue_begin is_stmt 0  # main.cpp:3:3
+	retq
+.Lfunc_end0:
+	.size	main, .Lfunc_end0-main
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	0                               # DW_CHILDREN_no
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	14                              # DW_FORM_strp
+	.ascii	"\260B"                         # DW_AT_GNU_dwo_name
+	.byte	14                              # DW_FORM_strp
+	.ascii	"\261B"                         # DW_AT_GNU_dwo_id
+	.byte	7                               # DW_FORM_data8
+	.byte	17                              # DW_AT_low_pc
+	.byte	1                               # DW_FORM_addr
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.ascii	"\263B"                         # DW_AT_GNU_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	4                               # DWARF version number
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.byte	8                               # Address Size (in bytes)
+	.byte	1                               # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lskel_string0                  # DW_AT_comp_dir
+	.long	.Lskel_string1                  # DW_AT_GNU_dwo_name
+	.quad	-9094791692727444213            # DW_AT_GNU_dwo_id
+	.quad	.Lfunc_begin0                   # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.long	.Laddr_table_base0              # DW_AT_GNU_addr_base
+.Ldebug_info_end0:
+	.section	.debug_str,"MS",@progbits,1
+.Lskel_string0:
+	.asciz	"." # string offset=0
+.Lskel_string1:
+	.asciz	"main.dwo"                      # string offset=2
+	.section	.debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+	.asciz	"main"                          # string offset=0
+.Linfo_string1:
+	.asciz	"int"                           # string offset=5
+.Linfo_string2:
+	.asciz	"clang version 22.0.0" # string offset=9
+.Linfo_string3:
+	.asciz	"main.cpp"                      # string offset=30
+.Linfo_string4:
+	.asciz	"main.dwo"                      # string offset=39
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	0
+	.long	5
+	.long	9
+	.long	30
+	.long	39
+	.section	.debug_info.dwo,"e",@progbits
+	.long	.Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+	.short	4                               # DWARF version number
+	.long	0                               # Offset Into Abbrev. Section
+	.byte	8                               # Address Size (in bytes)
+	.byte	1                               # Abbrev [1] 0xb:0x22 DW_TAG_compile_unit
+	.byte	2                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	3                               # DW_AT_name
+	.byte	4                               # DW_AT_GNU_dwo_name
+	.quad	-9094791692727444213            # DW_AT_GNU_dwo_id
+	.byte	2                               # Abbrev [2] 0x19:0xf DW_TAG_subprogram
+	.byte	0                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.byte	1                               # DW_AT_frame_base
+	.byte	86
+	.byte	0                               # DW_AT_name
+	.byte	1                               # DW_AT_decl_file
+	.byte	2                               # DW_AT_decl_line
+	.long	40                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	3                               # Abbrev [3] 0x28:0x4 DW_TAG_base_type
+	.byte	1                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_dwo_end0:
+	.section	.debug_abbrev.dwo,"e",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.ascii	"\202>"                         # DW_FORM_GNU_str_index
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.ascii	"\202>"                         # DW_FORM_GNU_str_index
+	.ascii	"\260B"                         # DW_AT_GNU_dwo_name
+	.ascii	"\202>"                         # DW_FORM_GNU_str_index
+	.ascii	"\261B"                         # DW_AT_GNU_dwo_id
+	.byte	7                               # DW_FORM_data8
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	0                               # DW_CHILDREN_no
+	.byte	17                              # DW_AT_low_pc
+	.ascii	"\201>"                         # DW_FORM_GNU_addr_index
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	64                              # DW_AT_frame_base
+	.byte	24                              # DW_FORM_exprloc
+	.byte	3                               # DW_AT_name
+	.ascii	"\202>"                         # DW_FORM_GNU_str_index
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.ascii	"\202>"                         # DW_FORM_GNU_str_index
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_addr,"",@progbits
+.Laddr_table_base0:
+	.quad	.Lfunc_begin0
+	.ident	"clang version 22.0.0"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.addrsig_sym _Z9getReturnv
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
+#--- helper.s
+# clang++ -g2 -gdwarf-4 -gsplit-dwarf=split -gno-pubnames -S helper.cpp
+# int getReturn() {
+#   return 0;
+# }
+	.file	"helper.cpp"
+	.globl	_Z9getReturnv                   # -- Begin function _Z9getReturnv
+	.type	_Z9getReturnv,@function
+_Z9getReturnv:                          # @_Z9getReturnv
+.Lfunc_begin0:
+	.file	1 "." "helper.cpp"
+	.loc	1 1 0                           # helper.cpp:1:0
+	.loc	1 2 3 prologue_end              # helper.cpp:2:3
+	.loc	1 2 3 epilogue_begin is_stmt 0  # helper.cpp:2:3
+	retq
+.Lfunc_end0:
+	.size	_Z9getReturnv, .Lfunc_end0-_Z9getReturnv
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	0                               # DW_CHILDREN_no
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	14                              # DW_FORM_strp
+	.ascii	"\260B"                         # DW_AT_GNU_dwo_name
+	.byte	14                              # DW_FORM_strp
+	.ascii	"\261B"                         # DW_AT_GNU_dwo_id
+	.byte	7                               # DW_FORM_data8
+	.byte	17                              # DW_AT_low_pc
+	.byte	1                               # DW_FORM_addr
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.ascii	"\263B"                         # DW_AT_GNU_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	4                               # DWARF version number
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.byte	8                               # Address Size (in bytes)
+	.byte	1                               # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lskel_string0                  # DW_AT_comp_dir
+	.long	.Lskel_string1                  # DW_AT_GNU_dwo_name
+	.quad	5976014880088676049             # DW_AT_GNU_dwo_id
+	.quad	.Lfunc_begin0                   # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.long	.Laddr_table_base0              # DW_AT_GNU_addr_base
+.Ldebug_info_end0:
+	.section	.debug_str,"MS",@progbits,1
+.Lskel_string0:
+	.asciz	"." # string offset=0
+.Lskel_string1:
+	.asciz	"helper.dwo"                    # string offset=2
+	.section	.debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+	.asciz	"_Z9getReturnv"                 # string offset=0
+.Linfo_string1:
+	.asciz	"getReturn"                     # string offset=14
+.Linfo_string2:
+	.asciz	"int"                           # string offset=24
+.Linfo_string3:
+	.asciz	"clang version 22.0.0" # string offset=28
+.Linfo_string4:
+	.asciz	"helper.cpp"                    # string offset=49
+.Linfo_string5:
+	.asciz	"helper.dwo"                    # string offset=60
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	0
+	.long	14
+	.long	24
+	.long	28
+	.long	49
+	.long	60
+	.section	.debug_info.dwo,"e",@progbits
+	.long	.Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+	.short	4                               # DWARF version number
+	.long	0                               # Offset Into Abbrev. Section
+	.byte	8                               # Address Size (in bytes)
+	.byte	1                               # Abbrev [1] 0xb:0x23 DW_TAG_compile_unit
+	.byte	3                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	4                               # DW_AT_name
+	.byte	5                               # DW_AT_GNU_dwo_name
+	.quad	5976014880088676049             # DW_AT_GNU_dwo_id
+	.byte	2                               # Abbrev [2] 0x19:0x10 DW_TAG_subprogram
+	.byte	0                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.byte	1                               # DW_AT_frame_base
+	.byte	86
+	.byte	0                               # DW_AT_linkage_name
+	.byte	1                               # DW_AT_name
+	.byte	1                               # DW_AT_decl_file
+	.byte	1                               # DW_AT_decl_line
+	.long	41                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	3                               # Abbrev [3] 0x29:0x4 DW_TAG_base_type
+	.byte	2                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_dwo_end0:
+	.section	.debug_abbrev.dwo,"e",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.ascii	"\202>"                         # DW_FORM_GNU_str_index
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.ascii	"\202>"                         # DW_FORM_GNU_str_index
+	.ascii	"\260B"                         # DW_AT_GNU_dwo_name
+	.ascii	"\202>"                         # DW_FORM_GNU_str_index
+	.ascii	"\261B"                         # DW_AT_GNU_dwo_id
+	.byte	7                               # DW_FORM_data8
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	0                               # DW_CHILDREN_no
+	.byte	17                              # DW_AT_low_pc
+	.ascii	"\201>"                         # DW_FORM_GNU_addr_index
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	64                              # DW_AT_frame_base
+	.byte	24                              # DW_FORM_exprloc
+	.byte	110                             # DW_AT_linkage_name
+	.ascii	"\202>"                         # DW_FORM_GNU_str_index
+	.byte	3                               # DW_AT_name
+	.ascii	"\202>"                         # DW_FORM_GNU_str_index
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.ascii	"\202>"                         # DW_FORM_GNU_str_index
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_addr,"",@progbits
+.Laddr_table_base0:
+	.quad	.Lfunc_begin0
+	.ident	"clang version 22.0.0"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s b/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s
new file mode 100644
index 0000000000000..5e938ea98bf95
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s
@@ -0,0 +1,368 @@
+#--- main.s
+# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -gno-pubnames -S main.cpp
+# extern int getReturn();
+# int main() {
+#   return getReturn();
+# }
+	.file	"main.cpp"
+	.globl	main                            # -- Begin function main
+	.type	main,@function
+main:                                   # @main
+.Lfunc_begin0:
+	.file	0 "." "main.cpp" md5 0x9cdef858e26cf684ed9ef3b60e05bdad
+	.loc	0 2 0                           # main.cpp:2:0
+	.loc	0 3 10 prologue_end             # main.cpp:3:10
+	.loc	0 3 3 epilogue_begin is_stmt 0  # main.cpp:3:3
+	retq
+.Lfunc_end0:
+	.size	main, .Lfunc_end0-main
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	74                              # DW_TAG_skeleton_unit
+	.byte	0                               # DW_CHILDREN_no
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	4                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.quad	-9094791692727444213
+	.byte	1                               # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.byte	0                               # DW_AT_comp_dir
+	.byte	1                               # DW_AT_dwo_name
+	.byte	0                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+.Ldebug_info_end0:
+	.section	.debug_str_offsets,"",@progbits
+	.long	12                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Lskel_string0:
+	.asciz	"." # string offset=0
+.Lskel_string1:
+	.asciz	"main.dwo"                      # string offset=2
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Lskel_string0
+	.long	.Lskel_string1
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	24                              # Length of String Offsets Set
+	.short	5
+	.short	0
+	.section	.debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+	.asciz	"main"                          # string offset=0
+.Linfo_string1:
+	.asciz	"int"                           # string offset=5
+.Linfo_string2:
+	.asciz	"clang version 22.0.0" # string offset=9
+.Linfo_string3:
+	.asciz	"main.cpp"                      # string offset=30
+.Linfo_string4:
+	.asciz	"main.dwo"                      # string offset=39
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	0
+	.long	5
+	.long	9
+	.long	30
+	.long	39
+	.section	.debug_info.dwo,"e",@progbits
+	.long	.Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+	.short	5                               # DWARF version number
+	.byte	5                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	0                               # Offset Into Abbrev. Section
+	.quad	-9094791692727444213
+	.byte	1                               # Abbrev [1] 0x14:0x1a DW_TAG_compile_unit
+	.byte	2                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	3                               # DW_AT_name
+	.byte	4                               # DW_AT_dwo_name
+	.byte	2                               # Abbrev [2] 0x1a:0xf DW_TAG_subprogram
+	.byte	0                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.byte	1                               # DW_AT_frame_base
+	.byte	86
+	.byte	0                               # DW_AT_name
+	.byte	0                               # DW_AT_decl_file
+	.byte	2                               # DW_AT_decl_line
+	.long	41                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	3                               # Abbrev [3] 0x29:0x4 DW_TAG_base_type
+	.byte	1                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_dwo_end0:
+	.section	.debug_abbrev.dwo,"e",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.byte	37                              # DW_FORM_strx1
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	0                               # DW_CHILDREN_no
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	64                              # DW_AT_frame_base
+	.byte	24                              # DW_FORM_exprloc
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_addr,"",@progbits
+	.long	.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+	.short	5                               # DWARF version number
+	.byte	8                               # Address size
+	.byte	0                               # Segment selector size
+.Laddr_table_base0:
+	.quad	.Lfunc_begin0
+.Ldebug_addr_end0:
+	.ident	"clang version 22.0.0"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.addrsig_sym _Z9getReturnv
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
+#--- helper.s
+# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -gno-pubnames -S helper.cpp
+# int getReturn() {
+#   return 0;
+# }
+	.file	"helper.cpp"
+	.globl	_Z9getReturnv                   # -- Begin function _Z9getReturnv
+	.type	_Z9getReturnv,@function
+_Z9getReturnv:                          # @_Z9getReturnv
+.Lfunc_begin0:
+	.file	0 "." "helper.cpp" md5 0xc7d7879297b54325c71b3e0cfbb65e2d
+	.loc	0 1 0                           # helper.cpp:1:0
+	.loc	0 2 3 prologue_end              # helper.cpp:2:3
+	.loc	0 2 3 epilogue_begin is_stmt 0  # helper.cpp:2:3
+	retq
+.Lfunc_end0:
+	.size	_Z9getReturnv, .Lfunc_end0-_Z9getReturnv
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	74                              # DW_TAG_skeleton_unit
+	.byte	0                               # DW_CHILDREN_no
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	4                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.quad	5976014880088676049
+	.byte	1                               # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.byte	0                               # DW_AT_comp_dir
+	.byte	1                               # DW_AT_dwo_name
+	.byte	0                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+.Ldebug_info_end0:
+	.section	.debug_str_offsets,"",@progbits
+	.long	12                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Lskel_string0:
+	.asciz	"." # string offset=0
+.Lskel_string1:
+	.asciz	"helper.dwo"                    # string offset=2
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Lskel_string0
+	.long	.Lskel_string1
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	28                              # Length of String Offsets Set
+	.short	5
+	.short	0
+	.section	.debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+	.asciz	"_Z9getReturnv"                 # string offset=0
+.Linfo_string1:
+	.asciz	"getReturn"                     # string offset=14
+.Linfo_string2:
+	.asciz	"int"                           # string offset=24
+.Linfo_string3:
+	.asciz	"clang version 22.0.0" # string offset=28
+.Linfo_string4:
+	.asciz	"helper.cpp"                    # string offset=49
+.Linfo_string5:
+	.asciz	"helper.dwo"                    # string offset=60
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	0
+	.long	14
+	.long	24
+	.long	28
+	.long	49
+	.long	60
+	.section	.debug_info.dwo,"e",@progbits
+	.long	.Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+	.short	5                               # DWARF version number
+	.byte	5                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	0                               # Offset Into Abbrev. Section
+	.quad	5976014880088676049
+	.byte	1                               # Abbrev [1] 0x14:0x1b DW_TAG_compile_unit
+	.byte	3                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	4                               # DW_AT_name
+	.byte	5                               # DW_AT_dwo_name
+	.byte	2                               # Abbrev [2] 0x1a:0x10 DW_TAG_subprogram
+	.byte	0                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.byte	1                               # DW_AT_frame_base
+	.byte	86
+	.byte	0                               # DW_AT_linkage_name
+	.byte	1                               # DW_AT_name
+	.byte	0                               # DW_AT_decl_file
+	.byte	1                               # DW_AT_decl_line
+	.long	42                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	3                               # Abbrev [3] 0x2a:0x4 DW_TAG_base_type
+	.byte	2                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_dwo_end0:
+	.section	.debug_abbrev.dwo,"e",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.byte	37                              # DW_FORM_strx1
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	0                               # DW_CHILDREN_no
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	64                              # DW_AT_frame_base
+	.byte	24                              # DW_FORM_exprloc
+	.byte	110                             # DW_AT_linkage_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_addr,"",@progbits
+	.long	.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+	.short	5                               # DWARF version number
+	.byte	8                               # Address size
+	.byte	0                               # Segment selector size
+.Laddr_table_base0:
+	.quad	.Lfunc_begin0
+.Ldebug_addr_end0:
+	.ident	"clang version 22.0.0"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test b/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test
new file mode 100644
index 0000000000000..a0e8721374a87
--- /dev/null
+++ b/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test
@@ -0,0 +1,76 @@
+; RUN: split-file %p/Inputs/dwarf4-str-split-dwarf.s %t
+; RUN: cd %t
+; RUN: llvm-mc --split-dwarf-file=main.dwo --triple=x86_64-unknown-linux-gnu \
+; RUN: --filetype=obj main.s -o=main.o
+; RUN: llvm-mc --split-dwarf-file=helper.dwo --triple=x86_64-unknown-linux-gnu \
+; RUN: --filetype=obj helper.s -o=helper.o
+; RUN: %clang %cflags -gdwarf-4 -gsplit-dwarf=split main.o helper.o -o main.exe
+; RUN: llvm-dwp -e main.exe -o main.exe.dwp
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.exe.dwp \
+; RUN: | FileCheck -check-prefix=PRE-BOLT-STR %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.exe.dwp \
+; RUN: | FileCheck -check-prefix=PRE-BOLT-STR-OFFSETS %s
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.dwo.dwo \
+; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.dwo.dwo \
+; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR-OFFSETS %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str helper.dwo.dwo \
+; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets helper.dwo.dwo \
+; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR-OFFSETS %s
+
+;; For DWARF4, this test checks that strings are split correctly from a combined
+;; section in DWP file, into appropriate .dwo files.
+
+; PRE-BOLT-STR: 0x00000000: "main"
+; PRE-BOLT-STR: 0x00000005: "int"
+; PRE-BOLT-STR: 0x00000009: "clang version 22.0.0"
+; PRE-BOLT-STR: 0x0000001e: "main.cpp"
+; PRE-BOLT-STR: 0x00000027: "main.dwo"
+; PRE-BOLT-STR: 0x00000030: "_Z9getReturnv"
+; PRE-BOLT-STR: 0x0000003e: "getReturn"
+; PRE-BOLT-STR: 0x00000048: "helper.cpp"
+; PRE-BOLT-STR: 0x00000053: "helper.dwo"
+
+; PRE-BOLT-STR-OFFSETS: 0x00000000: Contribution size = 20, Format = DWARF32, Version = 4
+; PRE-BOLT-STR-OFFSETS: 0x00000000: 00000000 "main"
+; PRE-BOLT-STR-OFFSETS: 0x00000004: 00000005 "int"
+; PRE-BOLT-STR-OFFSETS: 0x00000008: 00000009 "clang version 22.0.0"
+; PRE-BOLT-STR-OFFSETS: 0x0000000c: 0000001e "main.cpp"
+; PRE-BOLT-STR-OFFSETS: 0x00000010: 00000027 "main.dwo"
+; PRE-BOLT-STR-OFFSETS: 0x00000014: Contribution size = 24, Format = DWARF32, Version = 4
+; PRE-BOLT-STR-OFFSETS: 0x00000014: 00000030 "_Z9getReturnv"
+; PRE-BOLT-STR-OFFSETS: 0x00000018: 0000003e "getReturn"
+; PRE-BOLT-STR-OFFSETS: 0x0000001c: 00000005 "int"
+; PRE-BOLT-STR-OFFSETS: 0x00000020: 00000009 "clang version 22.0.0"
+; PRE-BOLT-STR-OFFSETS: 0x00000024: 00000048 "helper.cpp"
+; PRE-BOLT-STR-OFFSETS: 0x00000028: 00000053 "helper.dwo"
+
+; BOLT-MAIN-STR: 0x00000000: "main"
+; BOLT-MAIN-STR: 0x00000005: "int"
+; BOLT-MAIN-STR: 0x00000009: "clang version 22.0.0"
+; BOLT-MAIN-STR: 0x0000001e: "main.cpp"
+; BOLT-MAIN-STR: 0x00000027: "main.dwo"
+
+; BOLT-MAIN-STR-OFFSETS: 0x00000000: Contribution size = 20, Format = DWARF32, Version = 4
+; BOLT-MAIN-STR-OFFSETS: 0x00000000: 00000000 "main"
+; BOLT-MAIN-STR-OFFSETS: 0x00000004: 00000005 "int"
+; BOLT-MAIN-STR-OFFSETS: 0x00000008: 00000009 "clang version 22.0.0"
+; BOLT-MAIN-STR-OFFSETS: 0x0000000c: 0000001e "main.cpp"
+; BOLT-MAIN-STR-OFFSETS: 0x00000010: 00000027 "main.dwo"
+
+; BOLT-HELPER-STR: 0x00000000: "_Z9getReturnv"
+; BOLT-HELPER-STR: 0x0000000e: "getReturn"
+; BOLT-HELPER-STR: 0x00000018: "int"
+; BOLT-HELPER-STR: 0x0000001c: "clang version 22.0.0"
+; BOLT-HELPER-STR: 0x00000031: "helper.cpp"
+; BOLT-HELPER-STR: 0x0000003c: "helper.dwo"
+
+; BOLT-HELPER-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 4
+; BOLT-HELPER-STR-OFFSETS: 0x00000000: 00000000 "_Z9getReturnv"
+; BOLT-HELPER-STR-OFFSETS: 0x00000004: 0000000e "getReturn"
+; BOLT-HELPER-STR-OFFSETS: 0x00000008: 00000018 "int"
+; BOLT-HELPER-STR-OFFSETS: 0x0000000c: 0000001c "clang version 22.0.0"
+; BOLT-HELPER-STR-OFFSETS: 0x00000010: 00000031 "helper.cpp"
+; BOLT-HELPER-STR-OFFSETS: 0x00000014: 0000003c "helper.dwo"
diff --git a/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test b/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test
new file mode 100644
index 0000000000000..2e72c6a808924
--- /dev/null
+++ b/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test
@@ -0,0 +1,76 @@
+; RUN: split-file %p/Inputs/dwarf5-str-split-dwarf.s %t
+; RUN: cd %t
+; RUN: llvm-mc --split-dwarf-file=main.dwo --triple=x86_64-unknown-linux-gnu \
+; RUN: --filetype=obj main.s -o=main.o
+; RUN: llvm-mc --split-dwarf-file=helper.dwo --triple=x86_64-unknown-linux-gnu \
+; RUN: --filetype=obj helper.s -o=helper.o
+; RUN: %clang %cflags -gdwarf-4 -gsplit-dwarf=split main.o helper.o -o main.exe
+; RUN: llvm-dwp -e main.exe -o main.exe.dwp
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.exe.dwp \
+; RUN: | FileCheck -check-prefix=PRE-BOLT-STR %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.exe.dwp \
+; RUN: | FileCheck -check-prefix=PRE-BOLT-STR-OFFSETS %s
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.dwo.dwo \
+; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.dwo.dwo \
+; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR-OFFSETS %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str helper.dwo.dwo \
+; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets helper.dwo.dwo \
+; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR-OFFSETS %s
+
+;; For DWARF5, this test checks that strings are split correctly from a combined
+;; section in DWP file, into appropriate .dwo files.
+
+; PRE-BOLT-STR: 0x00000000: "main"
+; PRE-BOLT-STR: 0x00000005: "int"
+; PRE-BOLT-STR: 0x00000009: "clang version 22.0.0"
+; PRE-BOLT-STR: 0x0000001e: "main.cpp"
+; PRE-BOLT-STR: 0x00000027: "main.dwo"
+; PRE-BOLT-STR: 0x00000030: "_Z9getReturnv"
+; PRE-BOLT-STR: 0x0000003e: "getReturn"
+; PRE-BOLT-STR: 0x00000048: "helper.cpp"
+; PRE-BOLT-STR: 0x00000053: "helper.dwo"
+
+; PRE-BOLT-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 5
+; PRE-BOLT-STR-OFFSETS: 0x00000008: 00000000 "main"
+; PRE-BOLT-STR-OFFSETS: 0x0000000c: 00000005 "int"
+; PRE-BOLT-STR-OFFSETS: 0x00000010: 00000009 "clang version 22.0.0"
+; PRE-BOLT-STR-OFFSETS: 0x00000014: 0000001e "main.cpp"
+; PRE-BOLT-STR-OFFSETS: 0x00000018: 00000027 "main.dwo"
+; PRE-BOLT-STR-OFFSETS: 0x0000001c: Contribution size = 28, Format = DWARF32, Version = 5
+; PRE-BOLT-STR-OFFSETS: 0x00000024: 00000030 "_Z9getReturnv"
+; PRE-BOLT-STR-OFFSETS: 0x00000028: 0000003e "getReturn"
+; PRE-BOLT-STR-OFFSETS: 0x0000002c: 00000005 "int"
+; PRE-BOLT-STR-OFFSETS: 0x00000030: 00000009 "clang version 22.0.0"
+; PRE-BOLT-STR-OFFSETS: 0x00000034: 00000048 "helper.cpp"
+; PRE-BOLT-STR-OFFSETS: 0x00000038: 00000053 "helper.dwo"
+
+; BOLT-MAIN-STR: 0x00000000: "main"
+; BOLT-MAIN-STR: 0x00000005: "int"
+; BOLT-MAIN-STR: 0x00000009: "clang version 22.0.0"
+; BOLT-MAIN-STR: 0x0000001e: "main.cpp"
+; BOLT-MAIN-STR: 0x00000027: "main.dwo"
+
+; BOLT-MAIN-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 5
+; BOLT-MAIN-STR-OFFSETS: 0x00000008: 00000000 "main"
+; BOLT-MAIN-STR-OFFSETS: 0x0000000c: 00000005 "int"
+; BOLT-MAIN-STR-OFFSETS: 0x00000010: 00000009 "clang version 22.0.0"
+; BOLT-MAIN-STR-OFFSETS: 0x00000014: 0000001e "main.cpp"
+; BOLT-MAIN-STR-OFFSETS: 0x00000018: 00000027 "main.dwo"
+
+; BOLT-HELPER-STR: 0x00000000: "_Z9getReturnv"
+; BOLT-HELPER-STR: 0x0000000e: "getReturn"
+; BOLT-HELPER-STR: 0x00000018: "int"
+; BOLT-HELPER-STR: 0x0000001c: "clang version 22.0.0"
+; BOLT-HELPER-STR: 0x00000031: "helper.cpp"
+; BOLT-HELPER-STR: 0x0000003c: "helper.dwo"
+
+; BOLT-HELPER-STR-OFFSETS: 0x00000000: Contribution size = 28, Format = DWARF32, Version = 5
+; BOLT-HELPER-STR-OFFSETS: 0x00000008: 00000000 "_Z9getReturnv"
+; BOLT-HELPER-STR-OFFSETS: 0x0000000c: 0000000e "getReturn"
+; BOLT-HELPER-STR-OFFSETS: 0x00000010: 00000018 "int"
+; BOLT-HELPER-STR-OFFSETS: 0x00000014: 0000001c "clang version 22.0.0"
+; BOLT-HELPER-STR-OFFSETS: 0x00000018: 00000031 "helper.cpp"
+; BOLT-HELPER-STR-OFFSETS: 0x0000001c: 0000003c "helper.dwo"

From fc69bfb0ef69db30c232ec76b1b7da0007b2dcd1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 20:01:42 -0800
Subject: [PATCH 59/97] Hexagon: Enable terminal rule (#165960)

I had to hack many hexagon tests to disable the rule. I have
no idea how to update these tests. They appear to be testing specific
scheduling and packet formation of later machine passes, so any change
in the incoming mir is likely hiding whatever was originally intended.
I'll open an issue to fixup these tests once this lands.
---
 llvm/lib/Target/Hexagon/HexagonSubtarget.h      | 2 ++
 llvm/test/CodeGen/Hexagon/late_instr.ll         | 2 +-
 llvm/test/CodeGen/Hexagon/swp-carried-1.ll      | 2 +-
 llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll | 2 +-
 llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll   | 2 +-
 llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll   | 2 +-
 llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll    | 2 +-
 llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll    | 2 +-
 llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll  | 2 +-
 llvm/test/CodeGen/Hexagon/swp-order-copies.ll   | 2 +-
 llvm/test/CodeGen/Hexagon/swp-order-deps7.ll    | 2 +-
 llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll    | 2 +-
 12 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index 30794f61218a1..7dfede249c63c 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -294,6 +294,8 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
   bool useBSBScheduling() const { return UseBSBScheduling; }
   bool enableMachineScheduler() const override;
 
+  bool enableTerminalRule() const override { return true; }
+
   // Always use the TargetLowering default scheduler.
   // FIXME: This will use the vliw scheduler which is probably just hurting
   // compiler time and will be removed eventually anyway.
diff --git a/llvm/test/CodeGen/Hexagon/late_instr.ll b/llvm/test/CodeGen/Hexagon/late_instr.ll
index 93e5a7dba4b3b..6bd1261ed83d5 100644
--- a/llvm/test/CodeGen/Hexagon/late_instr.ll
+++ b/llvm/test/CodeGen/Hexagon/late_instr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -disable-hsdr < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -disable-hsdr -terminal-rule=0 < %s | FileCheck %s
 
 ; Check if instruction vandqrt.acc and its predecessor are scheduled in consecutive packets.
 ; CHECK: or(q{{[0-3]+}},q{{[0-3]+}})
diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll
index 6993bd672c01a..f2beadfbfa64b 100644
--- a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 -disable-cgp-delete-phis < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 -disable-cgp-delete-phis < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s
 
 ; Test that we generate the correct code when a loop carried value
 ; is scheduled one stage earlier than it's use. The code in
diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
index 006a8b6bfc94a..69b89a680ff5a 100644
--- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s
 
 ; This version of the conv3x3 test has both loops. This test checks that the
 ; inner loop has 14 packets.
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll
index d1b9c51c45a2d..0466b6df46142 100644
--- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv55 -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv55 -hexagon-initial-cfg-cleanup=0 -terminal-rule=0 < %s | FileCheck %s
 
 ; Test that the pipeliner correctly generates the operands in the
 ; epilog.
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll
index ba479b696f16c..c6631bd9dc16d 100644
--- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true -disable-cgp-delete-phis < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true -disable-cgp-delete-phis -terminal-rule=0 < %s | FileCheck %s
 
 ; Test epilogue generation when reading loop-carried dependency from a previous
 ; stage. The first epilogue should read value from iteration N-1 of the kernel.
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
index 96a38939dc50e..d90e7c4cde1ca 100644
--- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -O2 -enable-pipeliner -disable-block-placement=0 < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -O2 -enable-pipeliner -disable-block-placement=0 -terminal-rule=0 < %s | FileCheck %s
 
 ; For the Phis generated in the epilog, test that we generate the correct
 ; names for the values coming from the prolog stages. The test belows
diff --git a/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll b/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll
index 6ca8e94200b7d..2a428ff941a71 100644
--- a/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 -terminal-rule=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
 
 ; Test that we generate the correct names for the phis in the kernel for the
 ; incoming values. In this case, the loop contains a phi and has another phi
diff --git a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
index 42efe60b96d48..a0aeb80a5fa93 100644
--- a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -mcpu=hexagonv60 -enable-pipeliner -terminal-rule=0 < %s | FileCheck %s
 
 ; From coremark. Test that we pipeline the matrix multiplication bitextract
 ; function. The pipelined code should have two packets.
diff --git a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll
index 1c9cc4a1cf9d8..bbaa8cd635f3e 100644
--- a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s
 
 ; Test that the instruction ordering code in the pipeliner fixes up dependences
 ; between post-increment register definitions and uses so that the register
diff --git a/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll b/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll
index 5f1780fce39d2..38893de0b0829 100644
--- a/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s
 
 ; Test that the pipeliner cause an assert and correctly pipelines the
 ; loop.
diff --git a/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll b/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll
index 6c8b0638ae5d1..5189812d522c6 100644
--- a/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s
 
 ; Test that the pipeliner generates correct code when attempting to reuse
 ; an existing phi. This test case contains a phi that references another

From cf91088195b33c87e5f5de62defa21bc81c5f18a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 20:08:33 -0800
Subject: [PATCH 60/97] RISCV: Enable terminal rule (#165961)

---
 llvm/lib/Target/RISCV/RISCVSubtarget.h        |  1 +
 llvm/test/CodeGen/RISCV/branch-on-zero.ll     | 16 ++---
 llvm/test/CodeGen/RISCV/machine-pipeliner.ll  | 46 ++++++-------
 .../rvv/fixed-vectors-shuffle-exact-vlen.ll   | 10 ++-
 llvm/test/CodeGen/RISCV/rvv/pr95865.ll        | 43 ++++++------
 llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll   | 66 +++++++++----------
 .../CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll   | 28 ++++----
 .../RISCV/rvv/vxrm-insert-out-of-loop.ll      | 24 +++----
 8 files changed, 114 insertions(+), 120 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index d5ffa6c812cec..4026364c90e6c 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -146,6 +146,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
   }
 
   bool enableMachineScheduler() const override { return true; }
+  bool enableTerminalRule() const override { return true; }
 
   bool enablePostRAScheduler() const override { return UsePostRAScheduler; }
 
diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
index 02aeebdeb3775..2aec92eca145f 100644
--- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll
+++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
@@ -127,13 +127,11 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
 ; RV32-NEXT:  .LBB3_2: # %while.body
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    lw a3, 0(a1)
-; RV32-NEXT:    addi a4, a1, 4
+; RV32-NEXT:    addi a1, a1, 4
 ; RV32-NEXT:    slli a3, a3, 1
-; RV32-NEXT:    addi a1, a0, 4
 ; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    mv a1, a4
-; RV32-NEXT:    bne a4, a2, .LBB3_2
+; RV32-NEXT:    addi a0, a0, 4
+; RV32-NEXT:    bne a1, a2, .LBB3_2
 ; RV32-NEXT:  .LBB3_3: # %while.end
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
@@ -151,13 +149,11 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
 ; RV64-NEXT:  .LBB3_2: # %while.body
 ; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64-NEXT:    lw a3, 0(a1)
-; RV64-NEXT:    addi a4, a1, 4
+; RV64-NEXT:    addi a1, a1, 4
 ; RV64-NEXT:    slli a3, a3, 1
-; RV64-NEXT:    addi a1, a0, 4
 ; RV64-NEXT:    sw a3, 0(a0)
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    mv a1, a4
-; RV64-NEXT:    bne a4, a2, .LBB3_2
+; RV64-NEXT:    addi a0, a0, 4
+; RV64-NEXT:    bne a1, a2, .LBB3_2
 ; RV64-NEXT:  .LBB3_3: # %while.end
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
index d250098576687..a2a7da7e2d6ef 100644
--- a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll
@@ -54,37 +54,37 @@ define void @test_pipelined_1(ptr noalias %in, ptr noalias %out, i32 signext %cn
 ; CHECK-PIPELINED:       # %bb.0: # %entry
 ; CHECK-PIPELINED-NEXT:    blez a2, .LBB1_6
 ; CHECK-PIPELINED-NEXT:  # %bb.1: # %for.body.preheader
-; CHECK-PIPELINED-NEXT:    lw a4, 0(a1)
+; CHECK-PIPELINED-NEXT:    lw a7, 0(a1)
 ; CHECK-PIPELINED-NEXT:    addi a2, a2, -1
+; CHECK-PIPELINED-NEXT:    addi a3, a0, 4
+; CHECK-PIPELINED-NEXT:    addi a5, a1, 4
 ; CHECK-PIPELINED-NEXT:    sh2add.uw a6, a2, a1
-; CHECK-PIPELINED-NEXT:    addi a2, a0, 4
-; CHECK-PIPELINED-NEXT:    addi a1, a1, 4
 ; CHECK-PIPELINED-NEXT:    addi a6, a6, 4
-; CHECK-PIPELINED-NEXT:    beq a1, a6, .LBB1_5
+; CHECK-PIPELINED-NEXT:    beq a5, a6, .LBB1_5
 ; CHECK-PIPELINED-NEXT:  # %bb.2: # %for.body
-; CHECK-PIPELINED-NEXT:    lw a5, 0(a1)
-; CHECK-PIPELINED-NEXT:    addi a3, a2, 4
-; CHECK-PIPELINED-NEXT:    addi a4, a4, 1
-; CHECK-PIPELINED-NEXT:    addi a1, a1, 4
-; CHECK-PIPELINED-NEXT:    beq a1, a6, .LBB1_4
+; CHECK-PIPELINED-NEXT:    lw a1, 0(a5)
+; CHECK-PIPELINED-NEXT:    addi a4, a3, 4
+; CHECK-PIPELINED-NEXT:    addi a5, a5, 4
+; CHECK-PIPELINED-NEXT:    beq a5, a6, .LBB1_4
 ; CHECK-PIPELINED-NEXT:  .LBB1_3: # %for.body
 ; CHECK-PIPELINED-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-PIPELINED-NEXT:    sw a4, 0(a0)
-; CHECK-PIPELINED-NEXT:    mv a4, a5
-; CHECK-PIPELINED-NEXT:    lw a5, 0(a1)
-; CHECK-PIPELINED-NEXT:    mv a0, a2
-; CHECK-PIPELINED-NEXT:    mv a2, a3
-; CHECK-PIPELINED-NEXT:    addi a3, a3, 4
-; CHECK-PIPELINED-NEXT:    addi a4, a4, 1
-; CHECK-PIPELINED-NEXT:    addi a1, a1, 4
-; CHECK-PIPELINED-NEXT:    bne a1, a6, .LBB1_3
+; CHECK-PIPELINED-NEXT:    addi a2, a7, 1
+; CHECK-PIPELINED-NEXT:    mv a7, a1
+; CHECK-PIPELINED-NEXT:    lw a1, 0(a5)
+; CHECK-PIPELINED-NEXT:    sw a2, 0(a0)
+; CHECK-PIPELINED-NEXT:    mv a0, a3
+; CHECK-PIPELINED-NEXT:    mv a3, a4
+; CHECK-PIPELINED-NEXT:    addi a4, a4, 4
+; CHECK-PIPELINED-NEXT:    addi a5, a5, 4
+; CHECK-PIPELINED-NEXT:    bne a5, a6, .LBB1_3
 ; CHECK-PIPELINED-NEXT:  .LBB1_4:
-; CHECK-PIPELINED-NEXT:    sw a4, 0(a0)
-; CHECK-PIPELINED-NEXT:    mv a0, a2
-; CHECK-PIPELINED-NEXT:    mv a4, a5
+; CHECK-PIPELINED-NEXT:    addi a7, a7, 1
+; CHECK-PIPELINED-NEXT:    sw a7, 0(a0)
+; CHECK-PIPELINED-NEXT:    mv a0, a3
+; CHECK-PIPELINED-NEXT:    mv a7, a1
 ; CHECK-PIPELINED-NEXT:  .LBB1_5:
-; CHECK-PIPELINED-NEXT:    addi a4, a4, 1
-; CHECK-PIPELINED-NEXT:    sw a4, 0(a0)
+; CHECK-PIPELINED-NEXT:    addi a7, a7, 1
+; CHECK-PIPELINED-NEXT:    sw a7, 0(a0)
 ; CHECK-PIPELINED-NEXT:  .LBB1_6: # %for.end
 ; CHECK-PIPELINED-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index 9c6d77dde1b5c..c3fe6b335d3da 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -44,9 +44,8 @@ define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) {
 ; CHECK-LABEL: m2_splat_with_tail:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vrgather.vi v10, v8, 0
-; CHECK-NEXT:    vmv1r.v v11, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vrgather.vi v8, v10, 0
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
   ret <4 x i64> %res
@@ -99,9 +98,8 @@ define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) {
 ; CHECK-LABEL: m2_splat_into_identity:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vrgather.vi v10, v8, 0
-; CHECK-NEXT:    vmv1r.v v11, v9
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vrgather.vi v8, v10, 0
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
   ret <4 x i64> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
index ab9849631663c..a4c793b49d54a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
@@ -36,7 +36,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    .cfi_offset s10, -96
 ; CHECK-NEXT:    .cfi_offset s11, -104
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    li s2, 8
+; CHECK-NEXT:    li a7, 8
 ; CHECK-NEXT:    li t0, 12
 ; CHECK-NEXT:    li s0, 4
 ; CHECK-NEXT:    li t1, 20
@@ -45,7 +45,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    andi t3, a4, 1
-; CHECK-NEXT:    li t2, 4
+; CHECK-NEXT:    li s2, 4
 ; CHECK-NEXT:  .LBB0_1: # %for.cond1.preheader.i
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB0_2 Depth 2
@@ -53,9 +53,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    # Child Loop BB0_4 Depth 4
 ; CHECK-NEXT:    # Child Loop BB0_5 Depth 5
 ; CHECK-NEXT:    mv t4, t1
-; CHECK-NEXT:    mv t5, t2
+; CHECK-NEXT:    mv t2, s2
 ; CHECK-NEXT:    mv t6, t0
-; CHECK-NEXT:    mv a7, s2
+; CHECK-NEXT:    mv s3, a7
 ; CHECK-NEXT:    mv s4, a6
 ; CHECK-NEXT:  .LBB0_2: # %for.cond5.preheader.i
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
@@ -64,9 +64,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    # Child Loop BB0_4 Depth 4
 ; CHECK-NEXT:    # Child Loop BB0_5 Depth 5
 ; CHECK-NEXT:    mv s5, t4
-; CHECK-NEXT:    mv s6, t5
+; CHECK-NEXT:    mv t5, t2
 ; CHECK-NEXT:    mv s7, t6
-; CHECK-NEXT:    mv s3, a7
+; CHECK-NEXT:    mv s8, s3
 ; CHECK-NEXT:    mv s9, s4
 ; CHECK-NEXT:  .LBB0_3: # %for.cond9.preheader.i
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
@@ -75,9 +75,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    # Child Loop BB0_4 Depth 4
 ; CHECK-NEXT:    # Child Loop BB0_5 Depth 5
 ; CHECK-NEXT:    mv s11, s5
-; CHECK-NEXT:    mv a3, s6
+; CHECK-NEXT:    mv s6, t5
 ; CHECK-NEXT:    mv ra, s7
-; CHECK-NEXT:    mv s8, s3
+; CHECK-NEXT:    mv a5, s8
 ; CHECK-NEXT:    mv s1, s9
 ; CHECK-NEXT:  .LBB0_4: # %vector.ph.i
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
@@ -92,45 +92,44 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    # Parent Loop BB0_3 Depth=3
 ; CHECK-NEXT:    # Parent Loop BB0_4 Depth=4
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=5
-; CHECK-NEXT:    addi a5, a1, 4
-; CHECK-NEXT:    add a4, s8, a1
-; CHECK-NEXT:    add a1, a1, a3
+; CHECK-NEXT:    add a4, a5, a1
+; CHECK-NEXT:    add a3, s6, a1
+; CHECK-NEXT:    addi a1, a1, 4
 ; CHECK-NEXT:    vse32.v v8, (a4), v0.t
-; CHECK-NEXT:    vse32.v v8, (a1), v0.t
-; CHECK-NEXT:    mv a1, a5
-; CHECK-NEXT:    bne a5, s0, .LBB0_5
+; CHECK-NEXT:    vse32.v v8, (a3), v0.t
+; CHECK-NEXT:    bne a1, s0, .LBB0_5
 ; CHECK-NEXT:  # %bb.6: # %for.cond.cleanup15.i
 ; CHECK-NEXT:    # in Loop: Header=BB0_4 Depth=4
 ; CHECK-NEXT:    addi s1, s1, 4
-; CHECK-NEXT:    addi s8, s8, 4
+; CHECK-NEXT:    addi a5, a5, 4
 ; CHECK-NEXT:    addi ra, ra, 4
-; CHECK-NEXT:    addi a3, a3, 4
+; CHECK-NEXT:    addi s6, s6, 4
 ; CHECK-NEXT:    andi s10, a0, 1
 ; CHECK-NEXT:    addi s11, s11, 4
 ; CHECK-NEXT:    beqz s10, .LBB0_4
 ; CHECK-NEXT:  # %bb.7: # %for.cond.cleanup11.i
 ; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=3
 ; CHECK-NEXT:    addi s9, s9, 4
-; CHECK-NEXT:    addi s3, s3, 4
+; CHECK-NEXT:    addi s8, s8, 4
 ; CHECK-NEXT:    addi s7, s7, 4
-; CHECK-NEXT:    addi s6, s6, 4
+; CHECK-NEXT:    addi t5, t5, 4
 ; CHECK-NEXT:    andi a1, a2, 1
 ; CHECK-NEXT:    addi s5, s5, 4
 ; CHECK-NEXT:    beqz a1, .LBB0_3
 ; CHECK-NEXT:  # %bb.8: # %for.cond.cleanup7.i
 ; CHECK-NEXT:    # in Loop: Header=BB0_2 Depth=2
 ; CHECK-NEXT:    addi s4, s4, 4
-; CHECK-NEXT:    addi a7, a7, 4
+; CHECK-NEXT:    addi s3, s3, 4
 ; CHECK-NEXT:    addi t6, t6, 4
-; CHECK-NEXT:    addi t5, t5, 4
+; CHECK-NEXT:    addi t2, t2, 4
 ; CHECK-NEXT:    addi t4, t4, 4
 ; CHECK-NEXT:    beqz t3, .LBB0_2
 ; CHECK-NEXT:  # %bb.9: # %for.cond.cleanup3.i
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    addi a6, a6, 4
-; CHECK-NEXT:    addi s2, s2, 4
+; CHECK-NEXT:    addi a7, a7, 4
 ; CHECK-NEXT:    addi t0, t0, 4
-; CHECK-NEXT:    addi t2, t2, 4
+; CHECK-NEXT:    addi s2, s2, 4
 ; CHECK-NEXT:    addi t1, t1, 4
 ; CHECK-NEXT:    beqz a1, .LBB0_1
 ; CHECK-NEXT:  # %bb.10: # %l.exit
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
index f295bd8d74df3..386c736128794 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
@@ -2258,18 +2258,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-RV32-NEXT:    vsetvli a7, zero, e32, m2, ta, ma
 ; CHECK-RV32-NEXT:  .LBB98_3: # %vector.body
 ; CHECK-RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-RV32-NEXT:    slli a7, a6, 2
-; CHECK-RV32-NEXT:    add t0, a6, a4
-; CHECK-RV32-NEXT:    add a7, a0, a7
-; CHECK-RV32-NEXT:    vl2re32.v v8, (a7)
-; CHECK-RV32-NEXT:    sltu a6, t0, a6
-; CHECK-RV32-NEXT:    add a5, a5, a6
-; CHECK-RV32-NEXT:    xor a6, t0, a3
+; CHECK-RV32-NEXT:    mv a7, a6
+; CHECK-RV32-NEXT:    slli t0, a6, 2
+; CHECK-RV32-NEXT:    add a6, a6, a4
+; CHECK-RV32-NEXT:    add t0, a0, t0
+; CHECK-RV32-NEXT:    vl2re32.v v8, (t0)
+; CHECK-RV32-NEXT:    sltu a7, a6, a7
+; CHECK-RV32-NEXT:    add a5, a5, a7
+; CHECK-RV32-NEXT:    xor a7, a6, a3
 ; CHECK-RV32-NEXT:    vand.vx v8, v8, a1
-; CHECK-RV32-NEXT:    or t1, a6, a5
-; CHECK-RV32-NEXT:    vs2r.v v8, (a7)
-; CHECK-RV32-NEXT:    mv a6, t0
-; CHECK-RV32-NEXT:    bnez t1, .LBB98_3
+; CHECK-RV32-NEXT:    or a7, a7, a5
+; CHECK-RV32-NEXT:    vs2r.v v8, (t0)
+; CHECK-RV32-NEXT:    bnez a7, .LBB98_3
 ; CHECK-RV32-NEXT:  # %bb.4: # %middle.block
 ; CHECK-RV32-NEXT:    bnez a3, .LBB98_6
 ; CHECK-RV32-NEXT:  .LBB98_5: # %for.body
@@ -2350,18 +2350,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-ZVKB-NOZBB32-NEXT:    vsetvli a7, zero, e32, m2, ta, ma
 ; CHECK-ZVKB-NOZBB32-NEXT:  .LBB98_3: # %vector.body
 ; CHECK-ZVKB-NOZBB32-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-ZVKB-NOZBB32-NEXT:    slli a7, a6, 2
-; CHECK-ZVKB-NOZBB32-NEXT:    add t0, a6, a4
-; CHECK-ZVKB-NOZBB32-NEXT:    add a7, a0, a7
-; CHECK-ZVKB-NOZBB32-NEXT:    vl2re32.v v8, (a7)
-; CHECK-ZVKB-NOZBB32-NEXT:    sltu a6, t0, a6
-; CHECK-ZVKB-NOZBB32-NEXT:    add a5, a5, a6
-; CHECK-ZVKB-NOZBB32-NEXT:    xor a6, t0, a3
+; CHECK-ZVKB-NOZBB32-NEXT:    mv a7, a6
+; CHECK-ZVKB-NOZBB32-NEXT:    slli t0, a6, 2
+; CHECK-ZVKB-NOZBB32-NEXT:    add a6, a6, a4
+; CHECK-ZVKB-NOZBB32-NEXT:    add t0, a0, t0
+; CHECK-ZVKB-NOZBB32-NEXT:    vl2re32.v v8, (t0)
+; CHECK-ZVKB-NOZBB32-NEXT:    sltu a7, a6, a7
+; CHECK-ZVKB-NOZBB32-NEXT:    add a5, a5, a7
+; CHECK-ZVKB-NOZBB32-NEXT:    xor a7, a6, a3
 ; CHECK-ZVKB-NOZBB32-NEXT:    vandn.vx v8, v8, a1
-; CHECK-ZVKB-NOZBB32-NEXT:    or t1, a6, a5
-; CHECK-ZVKB-NOZBB32-NEXT:    vs2r.v v8, (a7)
-; CHECK-ZVKB-NOZBB32-NEXT:    mv a6, t0
-; CHECK-ZVKB-NOZBB32-NEXT:    bnez t1, .LBB98_3
+; CHECK-ZVKB-NOZBB32-NEXT:    or a7, a7, a5
+; CHECK-ZVKB-NOZBB32-NEXT:    vs2r.v v8, (t0)
+; CHECK-ZVKB-NOZBB32-NEXT:    bnez a7, .LBB98_3
 ; CHECK-ZVKB-NOZBB32-NEXT:  # %bb.4: # %middle.block
 ; CHECK-ZVKB-NOZBB32-NEXT:    bnez a3, .LBB98_7
 ; CHECK-ZVKB-NOZBB32-NEXT:  .LBB98_5: # %for.body.preheader
@@ -2444,18 +2444,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-ZVKB-ZBB32-NEXT:    vsetvli a7, zero, e32, m2, ta, ma
 ; CHECK-ZVKB-ZBB32-NEXT:  .LBB98_3: # %vector.body
 ; CHECK-ZVKB-ZBB32-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-ZVKB-ZBB32-NEXT:    slli a7, a6, 2
-; CHECK-ZVKB-ZBB32-NEXT:    add t0, a6, a4
-; CHECK-ZVKB-ZBB32-NEXT:    add a7, a0, a7
-; CHECK-ZVKB-ZBB32-NEXT:    vl2re32.v v8, (a7)
-; CHECK-ZVKB-ZBB32-NEXT:    sltu a6, t0, a6
-; CHECK-ZVKB-ZBB32-NEXT:    add a5, a5, a6
-; CHECK-ZVKB-ZBB32-NEXT:    xor a6, t0, a3
+; CHECK-ZVKB-ZBB32-NEXT:    mv a7, a6
+; CHECK-ZVKB-ZBB32-NEXT:    slli t0, a6, 2
+; CHECK-ZVKB-ZBB32-NEXT:    add a6, a6, a4
+; CHECK-ZVKB-ZBB32-NEXT:    add t0, a0, t0
+; CHECK-ZVKB-ZBB32-NEXT:    vl2re32.v v8, (t0)
+; CHECK-ZVKB-ZBB32-NEXT:    sltu a7, a6, a7
+; CHECK-ZVKB-ZBB32-NEXT:    add a5, a5, a7
+; CHECK-ZVKB-ZBB32-NEXT:    xor a7, a6, a3
 ; CHECK-ZVKB-ZBB32-NEXT:    vandn.vx v8, v8, a1
-; CHECK-ZVKB-ZBB32-NEXT:    or t1, a6, a5
-; CHECK-ZVKB-ZBB32-NEXT:    vs2r.v v8, (a7)
-; CHECK-ZVKB-ZBB32-NEXT:    mv a6, t0
-; CHECK-ZVKB-ZBB32-NEXT:    bnez t1, .LBB98_3
+; CHECK-ZVKB-ZBB32-NEXT:    or a7, a7, a5
+; CHECK-ZVKB-ZBB32-NEXT:    vs2r.v v8, (t0)
+; CHECK-ZVKB-ZBB32-NEXT:    bnez a7, .LBB98_3
 ; CHECK-ZVKB-ZBB32-NEXT:  # %bb.4: # %middle.block
 ; CHECK-ZVKB-ZBB32-NEXT:    bnez a3, .LBB98_6
 ; CHECK-ZVKB-ZBB32-NEXT:  .LBB98_5: # %for.body
diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
index ed6b7f1e6efb8..10440089cff10 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll
@@ -25,24 +25,24 @@ define dso_local void @test_store1(ptr nocapture noundef writeonly %dst, ptr noc
 ; RV32-NEXT:    li a6, 0
 ; RV32-NEXT:  .LBB0_4: # %vector.body
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    slli t0, a7, 2
-; RV32-NEXT:    addi t1, a7, 8
-; RV32-NEXT:    add t0, a1, t0
+; RV32-NEXT:    mv t0, a7
+; RV32-NEXT:    slli t1, a7, 2
+; RV32-NEXT:    addi a7, a7, 8
+; RV32-NEXT:    add t1, a1, t1
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vle32.v v8, (t0)
-; RV32-NEXT:    sltu a7, t1, a7
-; RV32-NEXT:    xor t0, t1, a5
-; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    vle32.v v8, (t1)
+; RV32-NEXT:    sltu t0, a7, t0
+; RV32-NEXT:    xor t1, a7, a5
+; RV32-NEXT:    add a6, a6, t0
 ; RV32-NEXT:    vmslt.vx v12, v8, a2
 ; RV32-NEXT:    vcompress.vm v10, v8, v12
-; RV32-NEXT:    vcpop.m a7, v12
-; RV32-NEXT:    vsetvli zero, a7, e32, m2, ta, ma
+; RV32-NEXT:    vcpop.m t0, v12
+; RV32-NEXT:    vsetvli zero, t0, e32, m2, ta, ma
 ; RV32-NEXT:    vse32.v v10, (a0)
-; RV32-NEXT:    slli a7, a7, 2
-; RV32-NEXT:    or t0, t0, a6
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    mv a7, t1
-; RV32-NEXT:    bnez t0, .LBB0_4
+; RV32-NEXT:    slli t0, t0, 2
+; RV32-NEXT:    or t1, t1, a6
+; RV32-NEXT:    add a0, a0, t0
+; RV32-NEXT:    bnez t1, .LBB0_4
 ; RV32-NEXT:  # %bb.5: # %middle.block
 ; RV32-NEXT:    bne a5, a3, .LBB0_9
 ; RV32-NEXT:  .LBB0_6: # %for.cond.cleanup
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index ead79fcf53d8b..af3b0852a6461 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -102,20 +102,20 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV32-NEXT:  .LBB0_13: # %vector.body
 ; RV32-NEXT:    # Parent Loop BB0_10 Depth=1
 ; RV32-NEXT:    # => This Inner Loop Header: Depth=2
-; RV32-NEXT:    add s0, a2, t6
-; RV32-NEXT:    add s1, a4, t6
-; RV32-NEXT:    vl2r.v v8, (s0)
-; RV32-NEXT:    add s0, a0, t6
+; RV32-NEXT:    mv s0, t6
+; RV32-NEXT:    add t6, a2, t6
+; RV32-NEXT:    add s1, a4, s0
+; RV32-NEXT:    vl2r.v v8, (t6)
+; RV32-NEXT:    add s2, a0, s0
 ; RV32-NEXT:    vl2r.v v10, (s1)
-; RV32-NEXT:    add s1, t6, t2
-; RV32-NEXT:    sltu t6, s1, t6
-; RV32-NEXT:    add t5, t5, t6
-; RV32-NEXT:    xor t6, s1, t4
+; RV32-NEXT:    add t6, s0, t2
+; RV32-NEXT:    sltu s0, t6, s0
+; RV32-NEXT:    add t5, t5, s0
+; RV32-NEXT:    xor s0, t6, t4
 ; RV32-NEXT:    vaaddu.vv v8, v8, v10
-; RV32-NEXT:    or s2, t6, t5
-; RV32-NEXT:    vs2r.v v8, (s0)
-; RV32-NEXT:    mv t6, s1
-; RV32-NEXT:    bnez s2, .LBB0_13
+; RV32-NEXT:    or s0, s0, t5
+; RV32-NEXT:    vs2r.v v8, (s2)
+; RV32-NEXT:    bnez s0, .LBB0_13
 ; RV32-NEXT:  # %bb.14: # %middle.block
 ; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
 ; RV32-NEXT:    beq t4, a6, .LBB0_9

From faf9ac0f6fc284e26515c55787cefd6ec807ab36 Mon Sep 17 00:00:00 2001
From: Kaviya Rajendiran <67495422+kaviya2510@users.noreply.github.com>
Date: Tue, 11 Nov 2025 09:49:14 +0530
Subject: [PATCH 61/97] [Flang][MLIR][OpenMP] Add MLIR lowering support for
 taskloop clauses. (#165851)

This patch add MLIR lowering support for the following taskloop clauses:

1. Default clause
2. Shared clause
3. Allocate clause
4. Final clause
5. If clause
6. Mergeable clause
7. Priority clause
8. Untied clause
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             |  14 +-
 .../Lower/OpenMP/Todo/taskloop-collapse.f90   |  15 ++
 .../OpenMP/Todo/taskloop-lastprivate.f90      |  13 ++
 .../Lower/OpenMP/Todo/taskloop-nogroup.f90    |  13 ++
 flang/test/Lower/OpenMP/if-clause.f90         |  26 +++-
 flang/test/Lower/OpenMP/implicit-dsa.f90      | 128 ++++++++++++++++-
 flang/test/Lower/OpenMP/taskloop.f90          | 129 +++++++++++++++++-
 7 files changed, 329 insertions(+), 9 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/Todo/taskloop-collapse.f90
 create mode 100644 flang/test/Lower/OpenMP/Todo/taskloop-lastprivate.f90
 create mode 100644 flang/test/Lower/OpenMP/Todo/taskloop-nogroup.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 4429a9a94ed3c..99470d0bbd2bf 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1769,14 +1769,18 @@ static void genTaskloopClauses(lower::AbstractConverter &converter,
                                mlir::omp::TaskloopOperands &clauseOps) {
 
   ClauseProcessor cp(converter, semaCtx, clauses);
+  cp.processAllocate(clauseOps);
+  cp.processFinal(stmtCtx, clauseOps);
   cp.processGrainsize(stmtCtx, clauseOps);
+  cp.processIf(llvm::omp::Directive::OMPD_taskloop, clauseOps);
+  cp.processMergeable(clauseOps);
   cp.processNumTasks(stmtCtx, clauseOps);
+  cp.processPriority(stmtCtx, clauseOps);
+  cp.processUntied(clauseOps);
 
-  cp.processTODO<clause::Allocate, clause::Collapse, clause::Default,
-                 clause::Final, clause::If, clause::InReduction,
-                 clause::Lastprivate, clause::Mergeable, clause::Nogroup,
-                 clause::Priority, clause::Reduction, clause::Shared,
-                 clause::Untied>(loc, llvm::omp::Directive::OMPD_taskloop);
+  cp.processTODO<clause::Collapse, clause::InReduction, clause::Lastprivate,
+                 clause::Nogroup, clause::Reduction>(
+      loc, llvm::omp::Directive::OMPD_taskloop);
 }
 
 static void genTaskwaitClauses(lower::AbstractConverter &converter,
diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-collapse.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-collapse.f90
new file mode 100644
index 0000000000000..cd54f5eeba6c4
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/taskloop-collapse.f90
@@ -0,0 +1,15 @@
+! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: Unhandled clause COLLAPSE in TASKLOOP construct
+subroutine omp_taskloop_collapse()
+   integer x
+   x = 0
+   !$omp taskloop collapse(2)
+   do i = 1, 100
+     do j = 1, 100
+      x = x + 1
+     end do
+   end do
+   !$omp end taskloop
+end subroutine omp_taskloop_collapse
diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-lastprivate.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-lastprivate.f90
new file mode 100644
index 0000000000000..54f2580daf283
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/taskloop-lastprivate.f90
@@ -0,0 +1,13 @@
+! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: Unhandled clause LASTPRIVATE in TASKLOOP construct
+subroutine omp_taskloop_lastprivate()
+   integer x
+   x = 0
+   !$omp taskloop lastprivate(x)
+   do i = 1, 100
+      x = x + 1
+   end do
+   !$omp end taskloop
+end subroutine omp_taskloop_lastprivate
diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-nogroup.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-nogroup.f90
new file mode 100644
index 0000000000000..2a0c5985290e2
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/taskloop-nogroup.f90
@@ -0,0 +1,13 @@
+! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: Unhandled clause NOGROUP in TASKLOOP construct
+subroutine omp_taskloop_nogroup()
+   integer x
+   x = 0
+   !$omp taskloop nogroup
+   do i = 1, 100
+      x = x + 1
+   end do
+   !$omp end taskloop
+end subroutine omp_taskloop_nogroup
diff --git a/flang/test/Lower/OpenMP/if-clause.f90 b/flang/test/Lower/OpenMP/if-clause.f90
index 3ae9018ae4d5d..e8f69b470e2ca 100644
--- a/flang/test/Lower/OpenMP/if-clause.f90
+++ b/flang/test/Lower/OpenMP/if-clause.f90
@@ -12,7 +12,6 @@ program main
   ! - PARALLEL SECTIONS
   ! - PARALLEL WORKSHARE
   ! - TARGET UPDATE
-  ! - TASKLOOP
   ! - TASKLOOP SIMD
 
   ! ----------------------------------------------------------------------------
@@ -1580,4 +1579,29 @@ program main
   !$omp teams if(teams: .true.)
   i = 1
   !$omp end teams
+
+  ! ----------------------------------------------------------------------------
+  ! TASKLOOP
+  ! ----------------------------------------------------------------------------
+
+  ! CHECK:      omp.taskloop
+  ! CHECK-NOT: if({{.*}})
+  !$omp taskloop
+  do i = 1, 10
+  end do
+  !$omp end taskloop
+
+  ! CHECK:      omp.taskloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp taskloop if(.true.)
+  do i = 1, 10
+  end do
+  !$omp end taskloop
+
+  ! CHECK:      omp.taskloop
+  ! CHECK-SAME: if({{.*}})
+  !$omp taskloop if(taskloop: .true.)
+  do i = 1, 10
+  end do
+  !$omp end taskloop
 end program main
diff --git a/flang/test/Lower/OpenMP/implicit-dsa.f90 b/flang/test/Lower/OpenMP/implicit-dsa.f90
index 0d2db63edfe79..9d01460253899 100644
--- a/flang/test/Lower/OpenMP/implicit-dsa.f90
+++ b/flang/test/Lower/OpenMP/implicit-dsa.f90
@@ -5,6 +5,36 @@
 
 ! Privatizers
 
+! CHECK-LABEL: omp.private
+! CHECK-SAME:      {type = private} @[[TASKLOOP_TEST3_I_PRIVATE:.*]] : i32
+! CHECK-NOT:   copy {
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME:      {type = firstprivate} @[[TASKLOOP_TEST3_X_FIRSTPRIVATE:.*]] : i32
+! CHECK-SAME:  copy {
+! CHECK:         hlfir.assign
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME:      {type = private} @[[TASKLOOP_TEST2_X_PRIVATE:.*]] : i32
+! CHECK-NOT:   copy {
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME:      {type = private} @[[TASKLOOP_TEST2_I_PRIVATE:.*]] : i32
+! CHECK-NOT:   copy {
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME:      {type = private} @[[TASKLOOP_TEST1_I_PRIVATE:.*]] : i32
+! CHECK-NOT:   copy {
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME:      {type = firstprivate} @[[TASKLOOP_TEST1_X_FIRSTPRIVATE:.*]] : i32
+! CHECK-SAME:  copy {
+! CHECK:         hlfir.assign
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME:      {type = private} @[[TASKLOOP_TEST1_Y_PRIVATE:.*]] : i32
+! CHECK-NOT:   copy {
+
 ! CHECK-LABEL: omp.private
 ! CHECK-SAME:      {type = firstprivate} @[[TEST7_Y_FIRSTPRIV:.*]] : i32
 ! CHECK-SAME:  copy {
@@ -310,4 +340,100 @@ subroutine implicit_dsa_test7
   !$omp end task
 end subroutine
 
-! TODO Test taskloop
+! Test taskloop
+! CHECK-LABEL:   func.func @_QPimplicit_dsa_taskloop_test1
+! CHECK:           %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test1Ei"}
+! CHECK:           %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"}
+! CHECK:           %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[ALLOCA_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"}
+! CHECK:           %[[DECL_Y:.*]]:2 = hlfir.declare %[[ALLOCA_Y]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[ALLOCA_Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_taskloop_test1Ez"}
+! CHECK:           %[[DECL_Z:.*]]:2 = hlfir.declare %[[ALLOCA_Z]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+subroutine implicit_dsa_taskloop_test1
+   integer :: x, y, z
+   ! CHECK: omp.taskloop private(
+   ! CHECK-SAME: @[[TASKLOOP_TEST1_Y_PRIVATE]] %[[DECL_Y]]#0 -> %[[ARG0:.*]], @[[TASKLOOP_TEST1_X_FIRSTPRIVATE]] %[[DECL_X]]#0 -> %[[ARG1:.*]], @[[TASKLOOP_TEST1_I_PRIVATE]] %[[DECL_I]]#0 -> %[[ARG2:.*]] : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>) {
+   ! CHECK: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) {
+   !$omp taskloop private(y) shared(z)
+   do i = 1, 100
+      ! CHECK: %[[Y_VAL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+      ! CHECK: %[[X_VAL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+      ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[DECL_Z]]#0 : !fir.ref<i32>
+      x = y + z
+      ! CHECK: hlfir.assign %{{.*}} to %[[X_VAL]]#0 : i32, !fir.ref<i32>
+   end do
+   !$omp end taskloop
+
+   ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST1_I_PRIVATE]] %[[DECL_I]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) {
+   !$omp taskloop default(shared)
+   do i = 1, 100
+      ! CHECK:  %[[LOAD_Y:.*]] = fir.load %[[DECL_Y]]#0 : !fir.ref<i32>
+      ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[DECL_Z]]#0 : !fir.ref<i32>
+      ! CHECK: %[[ADD_VAL:.*]] = arith.addi %[[LOAD_Y]], %[[LOAD_Z]] : i32
+      x = y + z
+      ! CHECK: hlfir.assign %[[ADD_VAL]] to %[[DECL_X]]#0 : i32, !fir.ref<i32>
+   end do
+   !$omp end taskloop
+end subroutine
+
+! Nested taskloop with implicit shared DSA variables.
+! CHECK-LABEL: func @_QPimplicit_dsa_taskloop_test2
+! CHECK:          %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test2Ei"}
+! CHECK:          %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:          %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"}
+! CHECK:          %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+subroutine implicit_dsa_taskloop_test2
+   integer :: x
+   ! CHECK:   omp.parallel {
+   !$omp parallel 
+   ! CHECK:   omp.taskloop private(@[[TASKLOOP_TEST2_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) {
+   !$omp taskloop
+   do i = 1, 100
+      ! CHECK: hlfir.assign %{{.*}} to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+      x = 2
+   end do
+   !$omp end taskloop
+
+   ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST2_X_PRIVATE]] %[[X_DECL]]#0 -> %[[ARG0]], @[[TASKLOOP_TEST2_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>, !fir.ref<i32>) {
+   !$omp taskloop private(x)
+   do i = 1, 10
+      ! CHECK: %[[DECL_PRIV_X:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+      ! CHECK: %[[LOAD_X:.*]] = fir.load %[[DECL_PRIV_X]]#0 : !fir.ref<i32>
+      x = x + 1
+      ! CHECK: hlfir.assign %{{.*}} to %[[DECL_PRIV_X]]#0 : i32, !fir.ref<i32>
+   end do
+   !$omp end parallel
+
+end subroutine
+
+! Taskloop with implicit firstprivate DSA variables, enclosed in private context.
+
+! CHECK-LABEL: func @_QPimplicit_dsa_taskloop_test3
+! CHECK:          %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test3Ei"}
+! CHECK:          %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:          %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"}
+! CHECK:          %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:          %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_taskloop_test3Ey"}
+! CHECK:          %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:          %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_taskloop_test3Ez"}
+! CHECK:          %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+subroutine implicit_dsa_taskloop_test3
+   integer :: x, y, z
+   ! CHECK:  omp.parallel private(@[[TASKLOOP_TEST3_X_FIRSTPRIVATE]] %[[X_DECL]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) {
+   ! CHECK:  %[[X_PRIV_VAL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+   !$omp parallel firstprivate(x)
+   ! CHECK:  omp.taskloop private(@[[TASKLOOP_TEST3_X_FIRSTPRIVATE]] %[[X_PRIV_VAL]]#0 -> %[[ARG1:.*]], @[[TASKLOOP_TEST3_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG2:.*]] : !fir.ref<i32>, !fir.ref<i32>) {
+   !$omp taskloop
+   ! CHECK:  %[[X_VAL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+   do i = 1, 100
+      ! CHECK: %[[LOAD_Y:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+      ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[Z_DECL]]#0 : !fir.ref<i32>
+      x = y + z
+      ! CHECK: hlfir.assign %{{.*}} to %[[X_VAL]]#0 : i32, !fir.ref<i32>
+   end do
+   !$omp end taskloop
+   !$omp end parallel
+end subroutine
+
diff --git a/flang/test/Lower/OpenMP/taskloop.f90 b/flang/test/Lower/OpenMP/taskloop.f90
index 79b0c20e176c0..4a06e4def0c83 100644
--- a/flang/test/Lower/OpenMP/taskloop.f90
+++ b/flang/test/Lower/OpenMP/taskloop.f90
@@ -1,5 +1,27 @@
-! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! REQUIRES: openmp_runtime
+! RUN: bbc -emit-hlfir %openmp_flags -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %openmp_flags -o - %s 2>&1 | FileCheck %s
+
+! CHECK-LABEL:  omp.private
+! CHECK-SAME:       {type = private} @[[OMP_TASKLOOP_UNTIEDEI_PRIVATE_I32:.*]] : i32
+
+! CHECK-LABEL:  omp.private
+! CHECK-SAME:       {type = private} @[[QFTEST_PRIORITYEI_PRIVATE_I32:.*]] : i32
+
+! CHECK-LABEL:  omp.private
+! CHECK-SAME:       {type = private} @[[QFTEST_MERGEABLEEI_PRIVATE_I32:.*]] : i32
+
+! CHECK-LABEL:  omp.private
+! CHECK-SAME:       {type = private} @[[I_PRIVATE_IF_TEST1:.*]] : i32
+
+! CHECK-LABEL:  omp.private
+! CHECK-SAME:       {type = private} @[[I_PRIVATE_FINAL:.*]] : i32
+
+! CHECK-LABEL:  omp.private
+! CHECK-SAME:       {type = private} @[[I_PRIVATE_TEST_ALLOCATE:.*]] : i32
+
+! CHECK-LABEL:  omp.private
+! CHECK-SAME:       {type = private} @[[X_PRIVATE_TEST_ALLOCATE:.*]] : i32
 
 ! CHECK-LABEL:  omp.private 
 ! CHECK-SAME:       {type = private} @[[I_PRIVATE_TEST2:.*]] : i32
@@ -70,3 +92,106 @@ subroutine omp_taskloop_private
 ! CHECK:         }
   !$omp end taskloop
 end subroutine omp_taskloop_private
+
+!===============================================================================
+! `allocate` clause
+!===============================================================================
+
+! CHECK-LABEL:  func.func @_QPtaskloop_allocate
+! CHECK:           %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtaskloop_allocateEi"}
+! CHECK:           %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFtaskloop_allocateEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtaskloop_allocateEx"}
+! CHECK:           %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFtaskloop_allocateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+subroutine taskloop_allocate()
+   use omp_lib
+   integer :: x
+   ! CHECK:           omp.taskloop allocate(%{{.*}} : i64 -> %[[DECL_X]]#0 : !fir.ref<i32>) 
+   ! CHECK-SAME:      private(@[[X_PRIVATE_TEST_ALLOCATE]] %[[DECL_X]]#0 -> %[[ARG0:.*]], @[[I_PRIVATE_TEST_ALLOCATE]] %[[DECL_I]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>, !fir.ref<i32>) {
+   !$omp taskloop allocate(omp_high_bw_mem_alloc: x) private(x)
+   do i = 1, 100
+      ! CHECK: arith.addi
+      x = x + 12
+      ! CHECK: omp.yield
+   end do
+   !$omp end taskloop
+end subroutine taskloop_allocate
+
+!===============================================================================
+! `final` clause
+!===============================================================================
+
+! CHECK-LABEL:  func.func @_QPtaskloop_final
+! CHECK:           %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtaskloop_finalEi"}
+! CHECK:           %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFtaskloop_finalEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+subroutine taskloop_final()
+    ! CHECK:  omp.taskloop final(%true) private(@[[I_PRIVATE_FINAL]] %[[DECL_I]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) {
+   !$omp taskloop final(.true.)
+   do i = 1, 100
+      ! CHECK: fir.call @_QPfoo()
+      call foo()
+   end do
+   !$omp end taskloop
+end subroutine
+
+!===============================================================================
+! `if` clause
+!===============================================================================
+
+! CHECK-LABEL:  func.func @_QPomp_taskloop_if
+! CHECK:            %[[DECL_BAR:.*]]:2 = hlfir.declare %[[ARG0:.*]] dummy_scope %{{.*}}
+! CHECK:           %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_ifEi"}
+! CHECK:           %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_ifEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[LOAD_VAL:.*]] = fir.load %[[DECL_BAR]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_BAR:.*]] = fir.convert %[[LOAD_VAL]] : (!fir.logical<4>) -> i1
+subroutine omp_taskloop_if(bar)
+   logical, intent(inout) :: bar
+   !CHECK: omp.taskloop if(%[[VAL_BAR]]) private(@[[I_PRIVATE_IF_TEST1]] %[[DECL_I]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>) {
+   !$omp taskloop if(bar)
+   do i = 1, 10
+      call foo()
+   end do
+   !$omp end taskloop
+end subroutine omp_taskloop_if
+
+!===============================================================================
+! `mergeable` clause
+!===============================================================================
+
+! CHECK-LABEL:  func.func @_QPtest_mergeable
+subroutine test_mergeable
+  ! CHECK: omp.taskloop mergeable
+  !$omp taskloop mergeable
+  do i = 1, 10
+  end do
+  !$omp end taskloop
+end subroutine test_mergeable
+
+!===============================================================================
+! `priority` clause
+!===============================================================================
+
+! CHECK-LABEL:  func.func @_QPtest_priority
+! CHECK:          %[[VAL1:.*]]:2 = hlfir.declare %[[ARG0:.*]] dummy_scope %{{.*}}
+! CHECK:          %[[LOAD_VAL:.*]] = fir.load %[[VAL1]]#0 : !fir.ref<i32>
+subroutine test_priority(n)
+   integer, intent(inout) :: n
+   ! CHECK:  omp.taskloop priority(%[[LOAD_VAL]] : i32)
+   !$omp taskloop priority(n)
+   do i = 1, 10
+   end do
+   !$omp end taskloop
+end subroutine test_priority
+
+!===============================================================================
+! `untied` clause
+!===============================================================================
+
+! CHECK-LABEL:  func.func @_QPomp_taskloop_untied
+subroutine omp_taskloop_untied()
+  ! CHECK: omp.taskloop untied
+  !$omp taskloop untied
+  do i = 1, 10
+    call foo()
+  end do
+  !$omp end taskloop
+end subroutine

From ba4babee6bab4ed049d1744b2eb951097e381a96 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Mon, 10 Nov 2025 20:29:00 -0800
Subject: [PATCH 62/97] [NFC][SpecialCaseList] Move most of implementation in
 cpp file (#167280)

This commit moves the `RegexMatcher`, `GlobMatcher`, `Matcher` and
`Section` classes into an anonymous namespace within
`SpecialCaseList.cpp`. These classes are implementation details of
`SpecialCaseList` and do not need to be exposed in the header.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 llvm/include/llvm/Support/SpecialCaseList.h |  95 +-----------
 llvm/lib/Support/SpecialCaseList.cpp        | 156 +++++++++++++++-----
 2 files changed, 128 insertions(+), 123 deletions(-)

diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index e92aeef2bfaae..5a012cf0c0264 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -12,19 +12,11 @@
 #ifndef LLVM_SUPPORT_SPECIALCASELIST_H
 #define LLVM_SUPPORT_SPECIALCASELIST_H
 
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/RadixTree.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/Allocator.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/GlobPattern.h"
-#include "llvm/Support/Regex.h"
+#include "llvm/Support/Error.h"
 #include <memory>
 #include <string>
 #include <utility>
-#include <variant>
 #include <vector>
 
 namespace llvm {
@@ -125,83 +117,11 @@ class SpecialCaseList {
   SpecialCaseList(SpecialCaseList const &) = delete;
   SpecialCaseList &operator=(SpecialCaseList const &) = delete;
 
-private:
-  using Match = std::pair<StringRef, unsigned>;
-  static constexpr Match NotMatched = {"", 0};
-
-  // Lagacy v1 matcher.
-  class RegexMatcher {
-  public:
-    LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
-    LLVM_ABI void preprocess(bool BySize);
-
-    LLVM_ABI Match match(StringRef Query) const;
-
-    struct Reg {
-      Reg(StringRef Name, unsigned LineNo, Regex &&Rg)
-          : Name(Name), LineNo(LineNo), Rg(std::move(Rg)) {}
-      StringRef Name;
-      unsigned LineNo;
-      Regex Rg;
-    };
-
-    std::vector<Reg> RegExes;
-  };
-
-  class GlobMatcher {
-  public:
-    LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
-    LLVM_ABI void preprocess(bool BySize);
-
-    LLVM_ABI Match match(StringRef Query) const;
-
-    struct Glob {
-      Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern)
-          : Name(Name), LineNo(LineNo), Pattern(std::move(Pattern)) {}
-      StringRef Name;
-      unsigned LineNo;
-      GlobPattern Pattern;
-    };
-
-    std::vector<GlobMatcher::Glob> Globs;
-
-    RadixTree<iterator_range<StringRef::const_iterator>,
-              RadixTree<iterator_range<StringRef::const_reverse_iterator>,
-                        SmallVector<int, 1>>>
-        PrefixSuffixToGlob;
-
-    RadixTree<iterator_range<StringRef::const_iterator>, SmallVector<int, 1>>
-        SubstrToGlob;
-  };
-
-  /// Represents a set of patterns and their line numbers
-  class Matcher {
-  public:
-    LLVM_ABI Matcher(bool UseGlobs, bool RemoveDotSlash);
-
-    LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
-    LLVM_ABI void preprocess(bool BySize);
-
-    LLVM_ABI Match match(StringRef Query) const;
-
-    LLVM_ABI bool matchAny(StringRef Query) const {
-      return match(Query) != NotMatched;
-    }
-
-    std::variant<RegexMatcher, GlobMatcher> M;
-    bool RemoveDotSlash;
-  };
-
-  using SectionEntries = StringMap<StringMap<Matcher>>;
-
-protected:
   class Section {
   public:
-    Section(StringRef Name, unsigned FileIdx, bool UseGlobs)
-        : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false), Name(Name),
-          FileIdx(FileIdx) {}
-
-    Section(Section &&) = default;
+    LLVM_ABI Section(StringRef Name, unsigned FileIdx, bool UseGlobs);
+    LLVM_ABI Section(Section &&);
+    LLVM_ABI ~Section();
 
     // Returns name of the section, its entire string in [].
     StringRef name() const { return Name; }
@@ -227,14 +147,11 @@ class SpecialCaseList {
 
   private:
     friend class SpecialCaseList;
-    LLVM_ABI void preprocess(bool OrderBySize);
-    LLVM_ABI const SpecialCaseList::Matcher *
-    findMatcher(StringRef Prefix, StringRef Category) const;
+    class SectionImpl;
 
-    Matcher SectionMatcher;
     StringRef Name;
-    SectionEntries Entries;
     unsigned FileIdx;
+    std::unique_ptr<SectionImpl> Impl;
   };
 
   ArrayRef<const Section> sections() const { return Sections; }
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index 42c8933a43399..91f98cf7fac6c 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -14,26 +14,94 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/SpecialCaseList.h"
+#include "llvm/ADT/RadixTree.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Support/GlobPattern.h"
 #include "llvm/Support/LineIterator.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <limits>
 #include <memory>
 #include <stdio.h>
 #include <string>
 #include <system_error>
 #include <utility>
+#include <variant>
+#include <vector>
 
 namespace llvm {
 
-Error SpecialCaseList::RegexMatcher::insert(StringRef Pattern,
-                                            unsigned LineNumber) {
+namespace {
+
+using Match = std::pair<StringRef, unsigned>;
+static constexpr Match NotMatched = {"", 0};
+
+// Lagacy v1 matcher.
+class RegexMatcher {
+public:
+  Error insert(StringRef Pattern, unsigned LineNumber);
+  void preprocess(bool BySize);
+
+  Match match(StringRef Query) const;
+
+  struct Reg {
+    Reg(StringRef Name, unsigned LineNo, Regex &&Rg)
+        : Name(Name), LineNo(LineNo), Rg(std::move(Rg)) {}
+    StringRef Name;
+    unsigned LineNo;
+    Regex Rg;
+  };
+
+  std::vector<Reg> RegExes;
+};
+
+class GlobMatcher {
+public:
+  Error insert(StringRef Pattern, unsigned LineNumber);
+  void preprocess(bool BySize);
+
+  Match match(StringRef Query) const;
+
+  struct Glob {
+    Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern)
+        : Name(Name), LineNo(LineNo), Pattern(std::move(Pattern)) {}
+    StringRef Name;
+    unsigned LineNo;
+    GlobPattern Pattern;
+  };
+
+  std::vector<GlobMatcher::Glob> Globs;
+
+  RadixTree<iterator_range<StringRef::const_iterator>,
+            RadixTree<iterator_range<StringRef::const_reverse_iterator>,
+                      SmallVector<int, 1>>>
+      PrefixSuffixToGlob;
+
+  RadixTree<iterator_range<StringRef::const_iterator>, SmallVector<int, 1>>
+      SubstrToGlob;
+};
+
+/// Represents a set of patterns and their line numbers
+class Matcher {
+public:
+  Matcher(bool UseGlobs, bool RemoveDotSlash);
+
+  Error insert(StringRef Pattern, unsigned LineNumber);
+  void preprocess(bool BySize);
+  Match match(StringRef Query) const;
+
+  bool matchAny(StringRef Query) const { return match(Query).second > 0; }
+
+  std::variant<RegexMatcher, GlobMatcher> M;
+  bool RemoveDotSlash;
+};
+
+Error RegexMatcher::insert(StringRef Pattern, unsigned LineNumber) {
   if (Pattern.empty())
     return createStringError(errc::invalid_argument,
                              "Supplied regex was blank");
@@ -57,7 +125,7 @@ Error SpecialCaseList::RegexMatcher::insert(StringRef Pattern,
   return Error::success();
 }
 
-void SpecialCaseList::RegexMatcher::preprocess(bool BySize) {
+void RegexMatcher::preprocess(bool BySize) {
   if (BySize) {
     llvm::stable_sort(RegExes, [](const Reg &A, const Reg &B) {
       return A.Name.size() < B.Name.size();
@@ -65,16 +133,14 @@ void SpecialCaseList::RegexMatcher::preprocess(bool BySize) {
   }
 }
 
-SpecialCaseList::Match
-SpecialCaseList::RegexMatcher::match(StringRef Query) const {
+Match RegexMatcher::match(StringRef Query) const {
   for (const auto &R : reverse(RegExes))
     if (R.Rg.match(Query))
       return {R.Name, R.LineNo};
   return NotMatched;
 }
 
-Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern,
-                                           unsigned LineNumber) {
+Error GlobMatcher::insert(StringRef Pattern, unsigned LineNumber) {
   if (Pattern.empty())
     return createStringError(errc::invalid_argument, "Supplied glob was blank");
 
@@ -85,7 +151,7 @@ Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern,
   return Error::success();
 }
 
-void SpecialCaseList::GlobMatcher::preprocess(bool BySize) {
+void GlobMatcher::preprocess(bool BySize) {
   if (BySize) {
     llvm::stable_sort(Globs, [](const Glob &A, const Glob &B) {
       return A.Name.size() < B.Name.size();
@@ -115,8 +181,7 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) {
   }
 }
 
-SpecialCaseList::Match
-SpecialCaseList::GlobMatcher::match(StringRef Query) const {
+Match GlobMatcher::match(StringRef Query) const {
   int Best = -1;
   if (!PrefixSuffixToGlob.empty()) {
     for (const auto &[_, SToGlob] : PrefixSuffixToGlob.find_prefixes(Query)) {
@@ -164,7 +229,7 @@ SpecialCaseList::GlobMatcher::match(StringRef Query) const {
   return {Globs[Best].Name, Globs[Best].LineNo};
 }
 
-SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash)
+Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash)
     : RemoveDotSlash(RemoveDotSlash) {
   if (UseGlobs)
     M.emplace<GlobMatcher>();
@@ -172,20 +237,34 @@ SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash)
     M.emplace<RegexMatcher>();
 }
 
-Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) {
+Error Matcher::insert(StringRef Pattern, unsigned LineNumber) {
   return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M);
 }
 
-void SpecialCaseList::Matcher::preprocess(bool BySize) {
+void Matcher::preprocess(bool BySize) {
   return std::visit([&](auto &V) { return V.preprocess(BySize); }, M);
 }
 
-SpecialCaseList::Match SpecialCaseList::Matcher::match(StringRef Query) const {
+Match Matcher::match(StringRef Query) const {
   if (RemoveDotSlash)
     Query = llvm::sys::path::remove_leading_dotslash(Query);
-  return std::visit(
-      [&](auto &V) -> SpecialCaseList::Match { return V.match(Query); }, M);
+  return std::visit([&](auto &V) -> Match { return V.match(Query); }, M);
 }
+} // namespace
+
+class SpecialCaseList::Section::SectionImpl {
+public:
+  void preprocess(bool OrderBySize);
+  const Matcher *findMatcher(StringRef Prefix, StringRef Category) const;
+
+  using SectionEntries = StringMap<StringMap<Matcher>>;
+
+  explicit SectionImpl(bool UseGlobs)
+      : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false) {}
+
+  Matcher SectionMatcher;
+  SectionEntries Entries;
+};
 
 // TODO: Refactor this to return Expected<...>
 std::unique_ptr<SpecialCaseList>
@@ -247,7 +326,7 @@ SpecialCaseList::addSection(StringRef SectionStr, unsigned FileNo,
   Sections.emplace_back(SectionStr, FileNo, UseGlobs);
   auto &Section = Sections.back();
 
-  if (auto Err = Section.SectionMatcher.insert(SectionStr, LineNo)) {
+  if (auto Err = Section.Impl->SectionMatcher.insert(SectionStr, LineNo)) {
     return createStringError(errc::invalid_argument,
                              "malformed section at line " + Twine(LineNo) +
                                  ": '" + SectionStr +
@@ -279,7 +358,7 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
     Error = toString(std::move(Err));
     return false;
   }
-  Section *CurrentSection = ErrOrSection.get();
+  Section::SectionImpl *CurrentImpl = ErrOrSection.get()->Impl.get();
 
   // This is the current list of prefixes for all existing users matching file
   // path. We may need parametrization in constructor in future.
@@ -307,7 +386,7 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
         Error = toString(std::move(Err));
         return false;
       }
-      CurrentSection = ErrOrSection.get();
+      CurrentImpl = ErrOrSection.get()->Impl.get();
       continue;
     }
 
@@ -320,7 +399,7 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
     }
 
     auto [Pattern, Category] = Postfix.split("=");
-    auto [It, _] = CurrentSection->Entries[Prefix].try_emplace(
+    auto [It, _] = CurrentImpl->Entries[Prefix].try_emplace(
         Category, UseGlobs,
         RemoveDotSlash && llvm::is_contained(PathPrefixes, Prefix));
     Pattern = Pattern.copy(StrAlloc);
@@ -334,7 +413,7 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
   }
 
   for (Section &S : Sections)
-    S.preprocess(OrderBySize);
+    S.Impl->preprocess(OrderBySize);
 
   return true;
 }
@@ -351,7 +430,7 @@ std::pair<unsigned, unsigned>
 SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix,
                                 StringRef Query, StringRef Category) const {
   for (const auto &S : reverse(Sections)) {
-    if (S.SectionMatcher.matchAny(Section)) {
+    if (S.Impl->SectionMatcher.matchAny(Section)) {
       unsigned Blame = S.getLastMatch(Prefix, Query, Category);
       if (Blame)
         return {S.FileIdx, Blame};
@@ -360,13 +439,22 @@ SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix,
   return NotFound;
 }
 
+SpecialCaseList::Section::Section(StringRef Str, unsigned FileIdx,
+                                  bool UseGlobs)
+    : Name(Str), FileIdx(FileIdx),
+      Impl(std::make_unique<SectionImpl>(UseGlobs)) {}
+
+SpecialCaseList::Section::Section(Section &&) = default;
+
+SpecialCaseList::Section::~Section() = default;
+
 bool SpecialCaseList::Section::matchName(StringRef Name) const {
-  return SectionMatcher.matchAny(Name);
+  return Impl->SectionMatcher.matchAny(Name);
 }
 
-const SpecialCaseList::Matcher *
-SpecialCaseList::Section::findMatcher(StringRef Prefix,
-                                      StringRef Category) const {
+const Matcher *
+SpecialCaseList::Section::SectionImpl::findMatcher(StringRef Prefix,
+                                                   StringRef Category) const {
   SectionEntries::const_iterator I = Entries.find(Prefix);
   if (I == Entries.end())
     return nullptr;
@@ -377,7 +465,7 @@ SpecialCaseList::Section::findMatcher(StringRef Prefix,
   return &II->second;
 }
 
-LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) {
+void SpecialCaseList::Section::SectionImpl::preprocess(bool OrderBySize) {
   SectionMatcher.preprocess(false);
   for (auto &[K1, E] : Entries)
     for (auto &[K2, M] : E)
@@ -387,7 +475,7 @@ LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) {
 unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix,
                                                 StringRef Query,
                                                 StringRef Category) const {
-  if (const Matcher *M = findMatcher(Prefix, Category))
+  if (const Matcher *M = Impl->findMatcher(Prefix, Category))
     return M->match(Query).second;
   return 0;
 }
@@ -395,13 +483,13 @@ unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix,
 StringRef SpecialCaseList::Section::getLongestMatch(StringRef Prefix,
                                                     StringRef Query,
                                                     StringRef Category) const {
-  if (const Matcher *M = findMatcher(Prefix, Category))
+  if (const Matcher *M = Impl->findMatcher(Prefix, Category))
     return M->match(Query).first;
   return {};
 }
 
 bool SpecialCaseList::Section::hasPrefix(StringRef Prefix) const {
-  return Entries.find(Prefix) != Entries.end();
+  return Impl->Entries.find(Prefix) != Impl->Entries.end();
 }
 
 } // namespace llvm

From 242d0c770ca31c8e8579239a09881fb2d2ceec18 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 20:48:30 -0800
Subject: [PATCH 63/97] SplitKit: Use initializer lists (#167449)

---
 llvm/lib/CodeGen/SplitKit.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index f9ecb2c97b2e0..8ec4bfbb5a330 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -1509,10 +1509,9 @@ void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) {
   }
 
   // Trace value through phis.
-  SmallPtrSet<const VNInfo *, 8> Visited; ///< whether VNI was/is in worklist.
-  SmallVector<const VNInfo *, 4> WorkList;
-  Visited.insert(&ParentVNI);
-  WorkList.push_back(&ParentVNI);
+  ///< whether VNI was/is in worklist.
+  SmallPtrSet<const VNInfo *, 8> Visited = {&ParentVNI};
+  SmallVector<const VNInfo *, 4> WorkList = {&ParentVNI};
 
   const LiveInterval &ParentLI = Edit->getParent();
   const SlotIndexes &Indexes = *LIS.getSlotIndexes();

From 0c098696f892ed88e2fa037791b86651da940acf Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 10 Nov 2025 21:35:18 -0800
Subject: [PATCH 64/97] ARC,M68k: Adapt #158240

---
 llvm/lib/Target/ARC/ARCInstrInfo.cpp   | 12 +++++-------
 llvm/lib/Target/ARC/ARCInstrInfo.h     |  6 ++----
 llvm/lib/Target/M68k/M68kInstrInfo.cpp | 14 ++++++--------
 llvm/lib/Target/M68k/M68kInstrInfo.h   |  6 ++----
 4 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
index 2dec6ffd89a75..e17ecbf87faae 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
@@ -294,8 +294,7 @@ void ARCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
 void ARCInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg,
-    bool IsKill, int FrameIndex, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
+    bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
     MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBB.findDebugLoc(I);
   MachineFunction &MF = *MBB.getParent();
@@ -307,11 +306,11 @@ void ARCInstrInfo::storeRegToStackSlot(
       MFI.getObjectAlign(FrameIndex));
 
   assert(MMO && "Couldn't get MachineMemOperand for store to stack.");
-  assert(TRI->getSpillSize(*RC) == 4 &&
+  assert(TRI.getSpillSize(*RC) == 4 &&
          "Only support 4-byte stores to stack now.");
   assert(ARC::GPR32RegClass.hasSubClassEq(RC) &&
          "Only support GPR32 stores to stack now.");
-  LLVM_DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, TRI)
+  LLVM_DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, &TRI)
                     << " to FrameIndex=" << FrameIndex << "\n");
   BuildMI(MBB, I, DL, get(ARC::ST_rs9))
       .addReg(SrcReg, getKillRegState(IsKill))
@@ -324,7 +323,6 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator I,
                                         Register DestReg, int FrameIndex,
                                         const TargetRegisterClass *RC,
-                                        const TargetRegisterInfo *TRI,
                                         Register VReg,
                                         MachineInstr::MIFlag Flags) const {
   DebugLoc DL = MBB.findDebugLoc(I);
@@ -336,11 +334,11 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
       MFI.getObjectAlign(FrameIndex));
 
   assert(MMO && "Couldn't get MachineMemOperand for store to stack.");
-  assert(TRI->getSpillSize(*RC) == 4 &&
+  assert(TRI.getSpillSize(*RC) == 4 &&
          "Only support 4-byte loads from stack now.");
   assert(ARC::GPR32RegClass.hasSubClassEq(RC) &&
          "Only support GPR32 stores to stack now.");
-  LLVM_DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, TRI)
+  LLVM_DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, &TRI)
                     << " from FrameIndex=" << FrameIndex << "\n");
   BuildMI(MBB, I, DL, get(ARC::LD_rs9))
       .addReg(DestReg, RegState::Define)
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.h b/llvm/lib/Target/ARC/ARCInstrInfo.h
index 2cf05ba57bd4b..ebeaf877f8436 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.h
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.h
@@ -70,14 +70,12 @@ class ARCInstrInfo : public ARCGenInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-      bool IsKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-      int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   bool
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
index c6be190bd1245..91077ff5961a4 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
@@ -43,7 +43,7 @@ using namespace llvm;
 void M68kInstrInfo::anchor() {}
 
 M68kInstrInfo::M68kInstrInfo(const M68kSubtarget &STI)
-    : M68kGenInstrInfo(STI, M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0,
+    : M68kGenInstrInfo(STI, RI, M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0,
                        M68k::RET),
       Subtarget(STI), RI(STI) {}
 
@@ -838,15 +838,14 @@ bool M68kInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
 
 void M68kInstrInfo::storeRegToStackSlot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-    bool IsKill, int FrameIndex, const TargetRegisterClass *RC,
-    const TargetRegisterInfo *TRI, Register VReg,
+    bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
     MachineInstr::MIFlag Flags) const {
   const MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo();
-  assert(MFI.getObjectSize(FrameIndex) >= TRI->getSpillSize(*RC) &&
+  assert(MFI.getObjectSize(FrameIndex) >= TRI.getSpillSize(*RC) &&
          "Stack slot is too small to store");
   (void)MFI;
 
-  unsigned Opc = getStoreRegOpcode(SrcReg, RC, TRI, Subtarget);
+  unsigned Opc = getStoreRegOpcode(SrcReg, RC, &TRI, Subtarget);
   DebugLoc DL = MBB.findDebugLoc(MI);
   // (0,FrameIndex) <- $reg
   M68k::addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIndex)
@@ -857,15 +856,14 @@ void M68kInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator MI,
                                          Register DstReg, int FrameIndex,
                                          const TargetRegisterClass *RC,
-                                         const TargetRegisterInfo *TRI,
                                          Register VReg,
                                          MachineInstr::MIFlag Flags) const {
   const MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo();
-  assert(MFI.getObjectSize(FrameIndex) >= TRI->getSpillSize(*RC) &&
+  assert(MFI.getObjectSize(FrameIndex) >= TRI.getSpillSize(*RC) &&
          "Stack slot is too small to load");
   (void)MFI;
 
-  unsigned Opc = getLoadRegOpcode(DstReg, RC, TRI, Subtarget);
+  unsigned Opc = getLoadRegOpcode(DstReg, RC, &TRI, Subtarget);
   DebugLoc DL = MBB.findDebugLoc(MI);
   M68k::addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DstReg), FrameIndex);
 }
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.h b/llvm/lib/Target/M68k/M68kInstrInfo.h
index 97615d60caa0b..2b3789d768602 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.h
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.h
@@ -280,14 +280,12 @@ class M68kInstrInfo : public M68kGenInstrInfo {
 
   void storeRegToStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
-      bool IsKill, int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   void loadRegFromStackSlot(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg,
-      int FrameIndex, const TargetRegisterClass *RC,
-      const TargetRegisterInfo *TRI, Register VReg,
+      int FrameIndex, const TargetRegisterClass *RC, Register VReg,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
   bool expandPostRAPseudo(MachineInstr &MI) const override;

From 9524b37c503f93528bd96395228cad8c800ab7a3 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 10 Nov 2025 21:48:34 -0800
Subject: [PATCH 65/97] [GitHub][CI] Add parallel to the ci-container-abi-tests
 container (#167432)

This way we can use the container for the libclang-abi-tests too.
---
 .github/workflows/containers/github-action-ci-tooling/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/containers/github-action-ci-tooling/Dockerfile b/.github/workflows/containers/github-action-ci-tooling/Dockerfile
index be61264b93753..b78c99efb9be3 100644
--- a/.github/workflows/containers/github-action-ci-tooling/Dockerfile
+++ b/.github/workflows/containers/github-action-ci-tooling/Dockerfile
@@ -108,6 +108,7 @@ RUN apt-get update && \
     abi-compliance-checker \
     abi-dumper \
     autoconf \
+    parallel \
     pkg-config && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*

From 00d2780cca3438c3f2ce56a2f9d5549bbcb67131 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 10 Nov 2025 21:55:03 -0800
Subject: [PATCH 66/97] workflows/llvm-abi-tests: Use new container (#167424)

This will speed up the tests by using a faster compiler and also
eliminating extra package install steps.
---
 .github/workflows/llvm-abi-tests.yml | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml
index 1ea134382ef6f..f75dd9c3abd9e 100644
--- a/.github/workflows/llvm-abi-tests.yml
+++ b/.github/workflows/llvm-abi-tests.yml
@@ -72,6 +72,8 @@ jobs:
     if: github.repository_owner == 'llvm'
     needs: abi-dump-setup
     runs-on: ubuntu-24.04
+    container:
+      image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:01e66b0847c1e9c88f0bd0492ed7c3374550a0730b48040f63888393f1ff6c13" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:bb0bd382ab2b"
     strategy:
       matrix:
         name:
@@ -87,19 +89,6 @@ jobs:
             ref: ${{ github.sha }}
             repo: ${{ github.repository }}
     steps:
-      - name: Install Ninja
-        uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main
-      - name: Install abi-compliance-checker
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install abi-dumper autoconf pkg-config
-      - name: Install universal-ctags
-        run: |
-          git clone https://github.com/universal-ctags/ctags.git
-          cd ctags
-          ./autogen.sh
-          ./configure
-          sudo make install
       - name: Download source code
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
@@ -143,6 +132,8 @@ jobs:
   abi-compare:
     if: github.repository_owner == 'llvm'
     runs-on: ubuntu-24.04
+    container:
+      image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:01e66b0847c1e9c88f0bd0492ed7c3374550a0730b48040f63888393f1ff6c13" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:bb0bd382ab2b
     needs:
       - abi-dump-setup
       - abi-dump
@@ -163,10 +154,6 @@ jobs:
           name: symbol-list
           path: symbol-list
 
-      - name: Install abi-compliance-checker
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install abi-compliance-checker
       - name: Compare ABI
         run: |
           if [ -s symbol-list/llvm.symbols ]; then

From 8475a66d4376344d2f3b330f351fe651eaf558dd Mon Sep 17 00:00:00 2001
From: hidekisaito <hidekido@amd.com>
Date: Mon, 10 Nov 2025 22:06:45 -0800
Subject: [PATCH 67/97] [AMDGPU] Correct validation for the intended behavior
 of the test (#167411)

---
 llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
index c552f9d283597..88a51e9ccf04c 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
+++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
@@ -1,10 +1,13 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s
 
+; Rematerialization test for fp64 constants (w/ intentionally high register pressure).
+; Check to make sure we have at least six constant MOVs, not necessarily consecutive, inside the loop.
+
 ; GCN-LABEL: {{^}}test_remat_sgpr:
 ; GCN-NOT:     v_writelane_b32
-; GCN-COUNT-4: s_mov_b32 s{{[0-9]+}}, 0x
 ; GCN:         {{^}}[[LOOP:.LBB[0-9_]+]]:
+; GCN-COUNT-6: {{s_mov_b32|v_mov_b32_e32}} {{[sv]}}{{[0-9]+}}, 0x
 ; GCN-NOT:     v_writelane_b32
 ; GCN:         s_cbranch_{{[^ ]+}} [[LOOP]]
 ; GCN: .sgpr_spill_count: 0

From 356b0ddc6cb02c405632d40433b2706eccaf2410 Mon Sep 17 00:00:00 2001
From: Brox Chen <guochen2@amd.com>
Date: Tue, 11 Nov 2025 01:13:33 -0500
Subject: [PATCH 68/97] [AMDGPU][CodeGen] enable D16Writes32BitVgpr for gfx12
 (#165587)

The issue in https://github.com/llvm/llvm-project/pull/157795 also
reproducible for gfx12.

This should only be used in true16 mode. Although gfx12 not yet has
true16 mode enable, there could be downstream branch to test true16 mode
manually thus enable this workaround now
---
 llvm/lib/Target/AMDGPU/AMDGPU.td     |   2 +
 llvm/test/CodeGen/AMDGPU/spillv16.ll | 204 ++++++++++++++++++++++++++-
 2 files changed, 204 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 54d94b1f8682e..0b61adf409948 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2069,6 +2069,7 @@ def FeatureISAVersion12 : FeatureSet<
    FeatureMemoryAtomicFAddF32DenormalSupport,
    FeatureBVHDualAndBVH8Insts,
    FeatureWaitsBeforeSystemScopeStores,
+   FeatureD16Writes32BitVgpr
    ]>;
 
 def FeatureISAVersion12_50 : FeatureSet<
@@ -2143,6 +2144,7 @@ def FeatureISAVersion12_50 : FeatureSet<
    FeatureSupportsXNACK,
    FeatureXNACK,
    FeatureClusters,
+   FeatureD16Writes32BitVgpr,
 ]>;
 
 def FeatureISAVersion12_51 : FeatureSet<
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
index 2d54ac8283a3a..9686c9d30b97c 100644
--- a/llvm/test/CodeGen/AMDGPU/spillv16.ll
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16,+d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W32
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16,-d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W16
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-TRUE16
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-FAKE16
 
@@ -35,6 +37,26 @@ define void @spill_i16_alu() {
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-LABEL: spill_i16_alu:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1250-TRUE16-LABEL: spill_i16_alu:
 ; GFX1250-TRUE16:       ; %bb.0: ; %entry
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -126,6 +148,56 @@ define void @spill_i16_alu_two_vals() {
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-D16W32-LABEL: spill_i16_alu_two_vals:
+; GFX12-TRUE16-D16W32:       ; %bb.0: ; %entry
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX12-TRUE16-D16W32-NEXT:    scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
+; GFX12-TRUE16-D16W32-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-D16W32-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-D16W32-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W32-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W32-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-D16W16-LABEL: spill_i16_alu_two_vals:
+; GFX12-TRUE16-D16W16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX12-TRUE16-D16W16-NEXT:    scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
+; GFX12-TRUE16-D16W16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-D16W16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-D16W16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-D16W16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-D16W16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-D16W16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1250-TRUE16-LABEL: spill_i16_alu_two_vals:
 ; GFX1250-TRUE16:       ; %bb.0: ; %entry
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -223,6 +295,25 @@ define void @spill_i16() {
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-LABEL: spill_i16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1250-LABEL: spill_i16:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -282,6 +373,25 @@ define void @spill_half() {
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-LABEL: spill_half:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1250-LABEL: spill_half:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -341,6 +451,25 @@ define void @spill_i16_from_v2i16() {
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-LABEL: spill_i16_from_v2i16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1250-LABEL: spill_i16_from_v2i16:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -414,13 +543,39 @@ define void @spill_2xi16_from_v2i16() {
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16:
 ; GFX1250-TRUE16:       ; %bb.0: ; %entry
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT:    s_clause 0x1
+; GFX1250-TRUE16-NEXT:    s_clause 0x1 ; 4-byte Folded Spill
 ; GFX1250-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:12
 ; GFX1250-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
@@ -444,7 +599,7 @@ define void @spill_2xi16_from_v2i16() {
 ; GFX1250-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS
 ; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-FAKE16-NEXT:    s_clause 0x1
+; GFX1250-FAKE16-NEXT:    s_clause 0x1 ; 4-byte Folded Spill
 ; GFX1250-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8
 ; GFX1250-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
 ; GFX1250-FAKE16-NEXT:    s_wait_loadcnt 0x0
@@ -520,6 +675,32 @@ define void @spill_2xi16_from_v2i16_one_free_reg() {
 ; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
 ; GFX1250-TRUE16:       ; %bb.0: ; %entry
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -595,6 +776,25 @@ define void @spill_v2i16() {
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-TRUE16-LABEL: spill_v2i16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX12-TRUE16-NEXT:    ;;#ASMSTART
+; GFX12-TRUE16-NEXT:    ;;#ASMEND
+; GFX12-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1250-LABEL: spill_v2i16:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0

From 1b4f226ad951f1c163c57a37aac1c97e446b2591 Mon Sep 17 00:00:00 2001
From: mitchell <mitchell.xu2@gmail.com>
Date: Tue, 11 Nov 2025 14:14:37 +0800
Subject: [PATCH 69/97] [clang-tidy] Add `IgnoreValueCalls` option to
 bugprone-unchecked-optional-access (#167209)

Add a new option `IgnoreValueCalls` to
`bugprone-unchecked-optional-access`

Closes [#163831](https://github.com/llvm/llvm-project/issues/163831)
---
 .../bugprone/UncheckedOptionalAccessCheck.h   |  4 +-
 clang-tools-extra/docs/ReleaseNotes.rst       |  6 ++-
 .../bugprone/unchecked-optional-access.rst    | 19 ++++++++
 ...unchecked-optional-access-ignore-value.cpp | 25 +++++++++++
 .../Models/UncheckedOptionalAccessModel.h     |  3 ++
 .../Models/UncheckedOptionalAccessModel.cpp   | 44 +++++++++++--------
 6 files changed, 81 insertions(+), 20 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp

diff --git a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h
index 11086fb4bfda1..62bf42da4f9f9 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h
+++ b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h
@@ -25,7 +25,8 @@ class UncheckedOptionalAccessCheck : public ClangTidyCheck {
 public:
   UncheckedOptionalAccessCheck(StringRef Name, ClangTidyContext *Context)
       : ClangTidyCheck(Name, Context),
-        ModelOptions{Options.get("IgnoreSmartPointerDereference", false)} {}
+        ModelOptions{Options.get("IgnoreSmartPointerDereference", false),
+                     Options.get("IgnoreValueCalls", false)} {}
   void registerMatchers(ast_matchers::MatchFinder *Finder) override;
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
   bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
@@ -34,6 +35,7 @@ class UncheckedOptionalAccessCheck : public ClangTidyCheck {
   void storeOptions(ClangTidyOptions::OptionMap &Opts) override {
     Options.store(Opts, "IgnoreSmartPointerDereference",
                   ModelOptions.IgnoreSmartPointerDereference);
+    Options.store(Opts, "IgnoreValueCalls", ModelOptions.IgnoreValueCalls);
   }
 
 private:
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 2628a26acdf5e..4283fe0b5ea69 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -367,7 +367,11 @@ Changes in existing checks
 - Improved :doc:`bugprone-unchecked-optional-access
   <clang-tidy/checks/bugprone/unchecked-optional-access>` check by supporting
   ``NullableValue::makeValue`` and ``NullableValue::makeValueInplace`` to
-  prevent false-positives for ``BloombergLP::bdlb::NullableValue`` type.
+  prevent false-positives for ``BloombergLP::bdlb::NullableValue`` type, and by
+  adding the `IgnoreValueCalls` option to suppress diagnostics for
+  ``optional::value()`` and the `IgnoreSmartPointerDereference` option to
+  ignore optionals reached via smart-pointer-like dereference, while still
+  diagnosing UB-prone dereferences via ``operator*`` and ``operator->``.
 
 - Improved :doc:`bugprone-unhandled-self-assignment
   <clang-tidy/checks/bugprone/unhandled-self-assignment>` check by adding
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
index 552e6db699696..ebed79e339d4b 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
@@ -308,3 +308,22 @@ advantages:
   * Performance. A single check can cover many or even all accesses within
     scope. This gives the user the best of both worlds -- the safety of a
     dynamic check, but without incurring redundant costs.
+
+Options
+-------
+
+.. option:: IgnoreSmartPointerDereference
+
+   If set to `true`, the check ignores optionals that
+   are reached through overloaded smart-pointer-like dereference (``operator*``,
+   ``operator->``) on classes other than the optional type itself. This helps
+   avoid false positives where the analysis cannot equate results across such
+   calls. This does not cover access through ``operator[]``. Default is `false`.
+
+.. option:: IgnoreValueCalls
+
+   If set to `true`, the check does not diagnose calls
+   to ``optional::value()``. Diagnostics for ``operator*()`` and
+   ``operator->()`` remain enabled. This is useful for codebases that
+   intentionally rely on ``value()`` for defined, guarded access while still
+   flagging UB-prone operator dereferences. Default is `false`.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp
new file mode 100644
index 0000000000000..f54621269f8c0
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp
@@ -0,0 +1,25 @@
+// RUN: %check_clang_tidy %s bugprone-unchecked-optional-access %t -- \
+// RUN:   -config="{CheckOptions:  \
+// RUN:     {bugprone-unchecked-optional-access.IgnoreValueCalls: true}}" -- \
+// RUN:   -I %S/Inputs/unchecked-optional-access
+
+#include "absl/types/optional.h"
+
+struct Foo {
+  void foo() const {}
+};
+
+void unchecked_value_access(const absl::optional<int> &opt) {
+  opt.value(); // no-warning
+}
+
+void unchecked_deref_operator_access(const absl::optional<int> &opt) {
+  *opt;
+  // CHECK-MESSAGES: :[[@LINE-1]]:4: warning: unchecked access to optional value
+}
+
+void unchecked_arrow_operator_access(const absl::optional<Foo> &opt) {
+  opt->foo();
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: unchecked access to optional value
+}
+
diff --git a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
index 696c9f4a6cf5c..c547d6ce2e387 100644
--- a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
@@ -46,6 +46,9 @@ struct UncheckedOptionalAccessModelOptions {
   /// are confident in this const accessor caching, we shouldn't need the
   /// IgnoreSmartPointerDereference option anymore.
   bool IgnoreSmartPointerDereference = false;
+
+  /// In generating diagnostics, ignore calls to `optional::value()`.
+  bool IgnoreValueCalls = false;
 };
 
 using UncheckedOptionalAccessLattice = CachedConstAccessorsLattice<NoopLattice>;
diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
index 0fa333eedcfdd..d90f5d4eaf7bb 100644
--- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
@@ -1153,26 +1153,34 @@ auto buildDiagnoseMatchSwitch(
   // FIXME: Evaluate the efficiency of matchers. If using matchers results in a
   // lot of duplicated work (e.g. string comparisons), consider providing APIs
   // that avoid it through memoization.
-  auto IgnorableOptional = ignorableOptional(Options);
-  return CFGMatchSwitchBuilder<
-             const Environment,
-             llvm::SmallVector<UncheckedOptionalAccessDiagnostic>>()
-      // optional::value
-      .CaseOfCFGStmt<CXXMemberCallExpr>(
-          valueCall(IgnorableOptional),
-          [](const CXXMemberCallExpr *E, const MatchFinder::MatchResult &,
-             const Environment &Env) {
-            return diagnoseUnwrapCall(E->getImplicitObjectArgument(), Env);
-          })
-
-      // optional::operator*, optional::operator->
-      .CaseOfCFGStmt<CallExpr>(valueOperatorCall(IgnorableOptional),
-                               [](const CallExpr *E,
+  const auto IgnorableOptional = ignorableOptional(Options);
+
+  auto DiagBuilder =
+      CFGMatchSwitchBuilder<
+          const Environment,
+          llvm::SmallVector<UncheckedOptionalAccessDiagnostic>>()
+          // optional::operator*, optional::operator->
+          .CaseOfCFGStmt<CallExpr>(
+              valueOperatorCall(IgnorableOptional),
+              [](const CallExpr *E, const MatchFinder::MatchResult &,
+                 const Environment &Env) {
+                return diagnoseUnwrapCall(E->getArg(0), Env);
+              });
+
+  auto Builder = Options.IgnoreValueCalls
+                     ? std::move(DiagBuilder)
+                     : std::move(DiagBuilder)
+                           // optional::value
+                           .CaseOfCFGStmt<CXXMemberCallExpr>(
+                               valueCall(IgnorableOptional),
+                               [](const CXXMemberCallExpr *E,
                                   const MatchFinder::MatchResult &,
                                   const Environment &Env) {
-                                 return diagnoseUnwrapCall(E->getArg(0), Env);
-                               })
-      .Build();
+                                 return diagnoseUnwrapCall(
+                                     E->getImplicitObjectArgument(), Env);
+                               });
+
+  return std::move(Builder).Build();
 }
 
 } // namespace

From 957f929eba875e4aeb25c6d8340575a6edb8588d Mon Sep 17 00:00:00 2001
From: Pengcheng Wang <wangpengcheng.pp@bytedance.com>
Date: Tue, 11 Nov 2025 14:16:05 +0800
Subject: [PATCH 70/97] [RISCV] Set
 __GCC_CONSTRUCTIVE_SIZE/__GCC_DESTRUCTIVE_SIZE to 64 (#162986)

These two macros were added in
https://github.com/llvm/llvm-project/pull/89446.

But the previous values may not be reasonable for RV64 systems because
most
of them have a cache line size 64B. So here we change them to 64.
---
 clang/docs/ReleaseNotes.rst          |  3 +++
 clang/lib/Basic/Targets/RISCV.h      |  2 +-
 clang/test/Preprocessor/init-riscv.c | 10 ++++++++++
 3 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Preprocessor/init-riscv.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c43f236aab319..980dbf1ff2cf6 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -598,6 +598,9 @@ RISC-V Support
 - Add `-march=unset` to clear any previous `-march=` value. This ISA string will
   be computed from `-mcpu` or the platform default.
 
+- `__GCC_CONSTRUCTIVE_SIZE` and `__GCC_DESTRUCTIVE_SIZE` are changed to 64. These values are
+  unstable according to `Clang's documentation <https://clang.llvm.org/docs/LanguageExtensions.html#gcc-destructive-size-and-gcc-constructive-size>`_.
+
 CUDA/HIP Language Changes
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
index 85fa4cc07dccf..21555b94fe65d 100644
--- a/clang/lib/Basic/Targets/RISCV.h
+++ b/clang/lib/Basic/Targets/RISCV.h
@@ -126,7 +126,7 @@ class RISCVTargetInfo : public TargetInfo {
   llvm::APInt getFMVPriority(ArrayRef<StringRef> Features) const override;
 
   std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
-    return std::make_pair(32, 32);
+    return std::make_pair(64, 64);
   }
 
   bool supportsCpuSupports() const override { return getTriple().isOSLinux(); }
diff --git a/clang/test/Preprocessor/init-riscv.c b/clang/test/Preprocessor/init-riscv.c
new file mode 100644
index 0000000000000..4eeecccff4378
--- /dev/null
+++ b/clang/test/Preprocessor/init-riscv.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -E -dM -triple=riscv32 < /dev/null | \
+// RUN:     FileCheck -match-full-lines -check-prefixes=RV32 %s
+// RUN: %clang_cc1 -E -dM -triple=riscv64 < /dev/null | \
+// RUN:     FileCheck -match-full-lines -check-prefixes=RV64 %s
+
+// RV32: #define __GCC_CONSTRUCTIVE_SIZE 64
+// RV32: #define __GCC_DESTRUCTIVE_SIZE 64
+
+// RV64: #define __GCC_CONSTRUCTIVE_SIZE 64
+// RV64: #define __GCC_DESTRUCTIVE_SIZE 64

From f2aad35caff5b316839d57f8003a7060c6c06612 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 10 Nov 2025 22:52:57 -0800
Subject: [PATCH 71/97] [XCore] Use MCRegister instead of unsigned. NFC
 (#167461)

---
 llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 096ad08d8a3c9..0e00db495256c 100644
--- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -69,7 +69,7 @@ static bool readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
   return true;
 }
 
-static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
+static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) {
   const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo();
   return RegInfo->getRegClass(RC).getRegister(RegNo);
 }
@@ -79,7 +79,7 @@ static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                               const MCDisassembler *Decoder) {
   if (RegNo > 11)
     return MCDisassembler::Fail;
-  unsigned Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo);
+  MCRegister Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo);
   Inst.addOperand(MCOperand::createReg(Reg));
   return MCDisassembler::Success;
 }
@@ -89,7 +89,7 @@ static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                              const MCDisassembler *Decoder) {
   if (RegNo > 15)
     return MCDisassembler::Fail;
-  unsigned Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo);
+  MCRegister Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo);
   Inst.addOperand(MCOperand::createReg(Reg));
   return MCDisassembler::Success;
 }

From 4c4b1a905e89278d5d02b8d5ba5e2a948d638a88 Mon Sep 17 00:00:00 2001
From: Lang Hames <lhames@gmail.com>
Date: Tue, 11 Nov 2025 18:11:41 +1100
Subject: [PATCH 72/97] [orc-rt] Replace wrapper fn `void *CallCtx` arg with
 `uint64_t CallId`. (#167452)

This argument serves as an opaque id (outside the ControllerAccess
object) for a call to a wrapper function. I expect that most
ControllerAccess implementations will want to use this argument as a
sequence number (plain integer), for which uint64_t will be a better fit
than void*. For ControllerAccess implementations that want to use a
pointer, uint64_t should be sufficiently large.
---
 orc-rt/include/orc-rt-c/WrapperFunction.h     |  7 +--
 orc-rt/include/orc-rt/SPSWrapperFunction.h    |  4 +-
 orc-rt/include/orc-rt/SimpleNativeMemoryMap.h |  8 ++--
 orc-rt/include/orc-rt/WrapperFunction.h       | 28 ++++++------
 orc-rt/lib/executor/SimpleNativeMemoryMap.cpp | 16 +++----
 orc-rt/unittests/DirectCaller.h               |  8 ++--
 orc-rt/unittests/SPSWrapperFunctionTest.cpp   | 43 ++++++++++---------
 7 files changed, 60 insertions(+), 54 deletions(-)

diff --git a/orc-rt/include/orc-rt-c/WrapperFunction.h b/orc-rt/include/orc-rt-c/WrapperFunction.h
index 280e513c9c0e6..fefdf03ff3f06 100644
--- a/orc-rt/include/orc-rt-c/WrapperFunction.h
+++ b/orc-rt/include/orc-rt-c/WrapperFunction.h
@@ -54,7 +54,7 @@ typedef struct {
  * Asynchronous return function for an orc-rt wrapper function.
  */
 typedef void (*orc_rt_WrapperFunctionReturn)(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionBuffer ResultBytes);
 
 /**
@@ -62,10 +62,11 @@ typedef void (*orc_rt_WrapperFunctionReturn)(
  *
  * ArgBytes contains the serialized arguments for the wrapper function.
  * Session holds a reference to the session object.
- * CallCtx holds a pointer to the context object for this particular call.
+ * CallId holds a pointer to the context object for this particular call.
  * Return holds a pointer to the return function.
  */
-typedef void (*orc_rt_WrapperFunction)(orc_rt_SessionRef Session, void *CallCtx,
+typedef void (*orc_rt_WrapperFunction)(orc_rt_SessionRef Session,
+                                       uint64_t CallId,
                                        orc_rt_WrapperFunctionReturn Return,
                                        orc_rt_WrapperFunctionBuffer ArgBytes);
 
diff --git a/orc-rt/include/orc-rt/SPSWrapperFunction.h b/orc-rt/include/orc-rt/SPSWrapperFunction.h
index 46c08a0c688d0..1ae2c130dc0cf 100644
--- a/orc-rt/include/orc-rt/SPSWrapperFunction.h
+++ b/orc-rt/include/orc-rt/SPSWrapperFunction.h
@@ -124,10 +124,10 @@ template <typename SPSSig> struct SPSWrapperFunction {
   }
 
   template <typename Handler>
-  static void handle(orc_rt_SessionRef Session, void *CallCtx,
+  static void handle(orc_rt_SessionRef Session, uint64_t CallId,
                      orc_rt_WrapperFunctionReturn Return,
                      WrapperFunctionBuffer ArgBytes, Handler &&H) {
-    WrapperFunction::handle(Session, CallCtx, Return, std::move(ArgBytes),
+    WrapperFunction::handle(Session, CallId, Return, std::move(ArgBytes),
                             WrapperFunctionSPSSerializer<SPSSig>(),
                             std::forward<Handler>(H));
   }
diff --git a/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h b/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h
index cf0e4ac732ca0..20b080e960dea 100644
--- a/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h
+++ b/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h
@@ -114,21 +114,21 @@ class SimpleNativeMemoryMap : public ResourceManager {
 } // namespace orc_rt
 
 ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_reserve_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes);
 
 ORC_RT_SPS_INTERFACE void
 orc_rt_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes);
 
 ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_initialize_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes);
 
 ORC_RT_SPS_INTERFACE void
 orc_rt_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes);
 
 #endif // ORC_RT_SIMPLENATIVEMEMORYMAP_H
diff --git a/orc-rt/include/orc-rt/WrapperFunction.h b/orc-rt/include/orc-rt/WrapperFunction.h
index 6e8b84e980dc0..494e2a0dc0bb1 100644
--- a/orc-rt/include/orc-rt/WrapperFunction.h
+++ b/orc-rt/include/orc-rt/WrapperFunction.h
@@ -137,14 +137,14 @@ using WFHandlerTraits = CallableTraitsHelper<WFHandlerTraitsImpl, C>;
 
 template <typename Serializer> class StructuredYieldBase {
 public:
-  StructuredYieldBase(orc_rt_SessionRef Session, void *CallCtx,
+  StructuredYieldBase(orc_rt_SessionRef Session, uint64_t CallId,
                       orc_rt_WrapperFunctionReturn Return, Serializer &&S)
-      : Session(Session), CallCtx(CallCtx), Return(Return),
+      : Session(Session), CallId(CallId), Return(Return),
         S(std::forward<Serializer>(S)) {}
 
 protected:
   orc_rt_SessionRef Session;
-  void *CallCtx;
+  uint64_t CallId;
   orc_rt_WrapperFunctionReturn Return;
   std::decay_t<Serializer> S;
 };
@@ -158,9 +158,9 @@ class StructuredYield<std::tuple<RetT>, Serializer>
   using StructuredYieldBase<Serializer>::StructuredYieldBase;
   void operator()(RetT &&R) {
     if (auto ResultBytes = this->S.result().serialize(std::forward<RetT>(R)))
-      this->Return(this->Session, this->CallCtx, ResultBytes->release());
+      this->Return(this->Session, this->CallId, ResultBytes->release());
     else
-      this->Return(this->Session, this->CallCtx,
+      this->Return(this->Session, this->CallId,
                    WrapperFunctionBuffer::createOutOfBandError(
                        "Could not serialize wrapper function result data")
                        .release());
@@ -173,7 +173,7 @@ class StructuredYield<std::tuple<>, Serializer>
 public:
   using StructuredYieldBase<Serializer>::StructuredYieldBase;
   void operator()() {
-    this->Return(this->Session, this->CallCtx,
+    this->Return(this->Session, this->CallId,
                  WrapperFunctionBuffer().release());
   }
 };
@@ -251,12 +251,12 @@ struct WrapperFunction {
   ///
   ///
   ///   static void adder_add_async_sps_wrapper(
-  ///       orc_rt_SessionRef Session, void *CallCtx,
+  ///       orc_rt_SessionRef Session, uint64_t CallId,
   ///       orc_rt_WrapperFunctionReturn Return,
   ///       orc_rt_WrapperFunctionBuffer ArgBytes) {
   ///     using SPSSig = SPSString(SPSExecutorAddr, int32_t, bool);
   ///     SPSWrapperFunction<SPSSig>::handle(
-  ///         Session, CallCtx, Return, ArgBytes,
+  ///         Session, CallId, Return, ArgBytes,
   ///         WrapperFunction::handleWithAsyncMethod(&MyClass::myMethod));
   ///   }
   ///   @endcode
@@ -313,12 +313,12 @@ struct WrapperFunction {
   ///
   ///
   ///   static void adder_add_sync_sps_wrapper(
-  ///       orc_rt_SessionRef Session, void *CallCtx,
+  ///       orc_rt_SessionRef Session, uint64_t CallId,
   ///       orc_rt_WrapperFunctionReturn Return,
   ///       orc_rt_WrapperFunctionBuffer ArgBytes) {
   ///     using SPSSig = SPSString(SPSExecutorAddr, int32_t, bool);
   ///     SPSWrapperFunction<SPSSig>::handle(
-  ///         Session, CallCtx, Return, ArgBytes,
+  ///         Session, CallId, Return, ArgBytes,
   ///         WrapperFunction::handleWithSyncMethod(&Adder::addSync));
   ///   }
   ///   @endcode
@@ -368,7 +368,7 @@ struct WrapperFunction {
   /// This utility deserializes and serializes arguments and return values
   /// (using the given Serializer), and calls the given handler.
   template <typename Serializer, typename Handler>
-  static void handle(orc_rt_SessionRef Session, void *CallCtx,
+  static void handle(orc_rt_SessionRef Session, uint64_t CallId,
                      orc_rt_WrapperFunctionReturn Return,
                      WrapperFunctionBuffer ArgBytes, Serializer &&S,
                      Handler &&H) {
@@ -380,16 +380,16 @@ struct WrapperFunction {
     typedef typename CallableArgInfo<Yield>::args_tuple_type RetTupleType;
 
     if (ArgBytes.getOutOfBandError())
-      return Return(Session, CallCtx, ArgBytes.release());
+      return Return(Session, CallId, ArgBytes.release());
 
     if (auto Args = S.arguments().template deserialize<ArgTuple>(ArgBytes))
       std::apply(HandlerTraits::forwardArgsAsRequested(bind_front(
                      std::forward<Handler>(H),
                      detail::StructuredYield<RetTupleType, Serializer>(
-                         Session, CallCtx, Return, std::move(S)))),
+                         Session, CallId, Return, std::move(S)))),
                  *Args);
     else
-      Return(Session, CallCtx,
+      Return(Session, CallId,
              WrapperFunctionBuffer::createOutOfBandError(
                  "Could not deserialize wrapper function arg data")
                  .release());
diff --git a/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp b/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp
index 5d410acbc65d9..c6a005d6a70e6 100644
--- a/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp
+++ b/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp
@@ -367,45 +367,45 @@ Error SimpleNativeMemoryMap::recordDeallocActions(
 }
 
 ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_reserve_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return,
     orc_rt_WrapperFunctionBuffer ArgBytes) {
   using Sig = SPSExpected<SPSExecutorAddr>(SPSExecutorAddr, SPSSize);
   SPSWrapperFunction<Sig>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       WrapperFunction::handleWithAsyncMethod(&SimpleNativeMemoryMap::reserve));
 }
 
 ORC_RT_SPS_INTERFACE void
 orc_rt_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return,
     orc_rt_WrapperFunctionBuffer ArgBytes) {
   using Sig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>);
-  SPSWrapperFunction<Sig>::handle(Session, CallCtx, Return, ArgBytes,
+  SPSWrapperFunction<Sig>::handle(Session, CallId, Return, ArgBytes,
                                   WrapperFunction::handleWithAsyncMethod(
                                       &SimpleNativeMemoryMap::releaseMultiple));
 }
 
 ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_initialize_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return,
     orc_rt_WrapperFunctionBuffer ArgBytes) {
   using Sig = SPSExpected<SPSExecutorAddr>(
       SPSExecutorAddr, SPSSimpleNativeMemoryMapInitializeRequest);
-  SPSWrapperFunction<Sig>::handle(Session, CallCtx, Return, ArgBytes,
+  SPSWrapperFunction<Sig>::handle(Session, CallId, Return, ArgBytes,
                                   WrapperFunction::handleWithAsyncMethod(
                                       &SimpleNativeMemoryMap::initialize));
 }
 
 ORC_RT_SPS_INTERFACE void
 orc_rt_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return,
     orc_rt_WrapperFunctionBuffer ArgBytes) {
   using Sig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>);
   SPSWrapperFunction<Sig>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       WrapperFunction::handleWithAsyncMethod(
           &SimpleNativeMemoryMap::deinitializeMultiple));
 }
diff --git a/orc-rt/unittests/DirectCaller.h b/orc-rt/unittests/DirectCaller.h
index 7c5c9d300d882..fab006717c157 100644
--- a/orc-rt/unittests/DirectCaller.h
+++ b/orc-rt/unittests/DirectCaller.h
@@ -22,10 +22,11 @@ class DirectCaller {
     virtual ~DirectResultSender() {}
     virtual void send(orc_rt_SessionRef Session,
                       orc_rt::WrapperFunctionBuffer ResultBytes) = 0;
-    static void send(orc_rt_SessionRef Session, void *CallCtx,
+    static void send(orc_rt_SessionRef Session, uint64_t CallId,
                      orc_rt_WrapperFunctionBuffer ResultBytes) {
       std::unique_ptr<DirectResultSender>(
-          reinterpret_cast<DirectResultSender *>(CallCtx))
+          reinterpret_cast<DirectResultSender *>(
+              static_cast<uintptr_t>(CallId)))
           ->send(Session, ResultBytes);
     }
   };
@@ -59,7 +60,8 @@ class DirectCaller {
                   orc_rt::WrapperFunctionBuffer ArgBytes) {
     auto DR =
         makeDirectResultSender(std::forward<HandleResultFn>(HandleResult));
-    Fn(Session, reinterpret_cast<void *>(DR.release()),
+    Fn(Session,
+       static_cast<uint64_t>(reinterpret_cast<uintptr_t>(DR.release())),
        DirectResultSender::send, ArgBytes.release());
   }
 
diff --git a/orc-rt/unittests/SPSWrapperFunctionTest.cpp b/orc-rt/unittests/SPSWrapperFunctionTest.cpp
index 81e5755e821f3..3b1592d89d8f1 100644
--- a/orc-rt/unittests/SPSWrapperFunctionTest.cpp
+++ b/orc-rt/unittests/SPSWrapperFunctionTest.cpp
@@ -22,11 +22,11 @@
 
 using namespace orc_rt;
 
-static void void_noop_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+static void void_noop_sps_wrapper(orc_rt_SessionRef Session, uint64_t CallId,
                                   orc_rt_WrapperFunctionReturn Return,
                                   orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<void()>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       [](move_only_function<void()> Return) { Return(); });
 }
 
@@ -40,11 +40,12 @@ TEST(SPSWrapperFunctionUtilsTest, VoidNoop) {
   EXPECT_TRUE(Ran);
 }
 
-static void add_via_lambda_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+static void add_via_lambda_sps_wrapper(orc_rt_SessionRef Session,
+                                       uint64_t CallId,
                                        orc_rt_WrapperFunctionReturn Return,
                                        orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<int32_t(int32_t, int32_t)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       [](move_only_function<void(int32_t)> Return, int32_t X, int32_t Y) {
         Return(X + Y);
       });
@@ -64,11 +65,11 @@ static void add_via_function(move_only_function<void(int32_t)> Return,
 }
 
 static void
-add_via_function_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+add_via_function_sps_wrapper(orc_rt_SessionRef Session, uint64_t CallId,
                              orc_rt_WrapperFunctionReturn Return,
                              orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<int32_t(int32_t, int32_t)>::handle(
-      Session, CallCtx, Return, ArgBytes, add_via_function);
+      Session, CallId, Return, ArgBytes, add_via_function);
 }
 
 TEST(SPSWrapperFunctionUtilsTest, BinaryOpViaFunction) {
@@ -80,11 +81,11 @@ TEST(SPSWrapperFunctionUtilsTest, BinaryOpViaFunction) {
 }
 
 static void
-add_via_function_pointer_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+add_via_function_pointer_sps_wrapper(orc_rt_SessionRef Session, uint64_t CallId,
                                      orc_rt_WrapperFunctionReturn Return,
                                      orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<int32_t(int32_t, int32_t)>::handle(
-      Session, CallCtx, Return, ArgBytes, &add_via_function);
+      Session, CallId, Return, ArgBytes, &add_via_function);
 }
 
 TEST(SPSWrapperFunctionUtilsTest, BinaryOpViaFunctionPointer) {
@@ -96,11 +97,12 @@ TEST(SPSWrapperFunctionUtilsTest, BinaryOpViaFunctionPointer) {
 }
 
 static void
-round_trip_string_via_span_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+round_trip_string_via_span_sps_wrapper(orc_rt_SessionRef Session,
+                                       uint64_t CallId,
                                        orc_rt_WrapperFunctionReturn Return,
                                        orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<SPSString(SPSString)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       [](move_only_function<void(std::string)> Return, span<const char> S) {
         Return({S.data(), S.size()});
       });
@@ -119,11 +121,11 @@ TEST(SPSWrapperFunctionUtilsTest, RoundTripStringViaSpan) {
 }
 
 static void improbable_feat_sps_wrapper(orc_rt_SessionRef Session,
-                                        void *CallCtx,
+                                        uint64_t CallId,
                                         orc_rt_WrapperFunctionReturn Return,
                                         orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<SPSError(bool)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       [](move_only_function<void(Error)> Return, bool LuckyHat) {
         if (LuckyHat)
           Return(Error::success());
@@ -155,11 +157,11 @@ TEST(SPSWrapperFunctionUtilsTest, TransparentConversionErrorFailureCase) {
   EXPECT_EQ(ErrMsg, "crushed by boulder");
 }
 
-static void halve_number_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+static void halve_number_sps_wrapper(orc_rt_SessionRef Session, uint64_t CallId,
                                      orc_rt_WrapperFunctionReturn Return,
                                      orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<SPSExpected<int32_t>(int32_t)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       [](move_only_function<void(Expected<int32_t>)> Return, int N) {
         if (N % 2 == 0)
           Return(N >> 1);
@@ -208,12 +210,12 @@ class SPSSerializationTraits<SPSOpCounter<N>, OpCounter<N>> {
 
 static void
 handle_with_reference_types_sps_wrapper(orc_rt_SessionRef Session,
-                                        void *CallCtx,
+                                        uint64_t CallId,
                                         orc_rt_WrapperFunctionReturn Return,
                                         orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<void(
       SPSOpCounter<0>, SPSOpCounter<1>, SPSOpCounter<2>,
-      SPSOpCounter<3>)>::handle(Session, CallCtx, Return, ArgBytes,
+      SPSOpCounter<3>)>::handle(Session, CallId, Return, ArgBytes,
                                 [](move_only_function<void()> Return,
                                    OpCounter<0>, OpCounter<1> &,
                                    const OpCounter<2> &,
@@ -281,11 +283,11 @@ class Adder {
 } // anonymous namespace
 
 static void adder_add_async_sps_wrapper(orc_rt_SessionRef Session,
-                                        void *CallCtx,
+                                        uint64_t CallId,
                                         orc_rt_WrapperFunctionReturn Return,
                                         orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<int32_t(SPSExecutorAddr, int32_t, int32_t)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       WrapperFunction::handleWithAsyncMethod(&Adder::addAsync));
 }
 
@@ -300,11 +302,12 @@ TEST(SPSWrapperFunctionUtilsTest, HandleWtihAsyncMethod) {
   EXPECT_EQ(Result, 42);
 }
 
-static void adder_add_sync_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+static void adder_add_sync_sps_wrapper(orc_rt_SessionRef Session,
+                                       uint64_t CallId,
                                        orc_rt_WrapperFunctionReturn Return,
                                        orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<int32_t(SPSExecutorAddr, int32_t, int32_t)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       WrapperFunction::handleWithSyncMethod(&Adder::addSync));
 }
 

From 464cb33d1114fa3daddd26691fb0935b93b21776 Mon Sep 17 00:00:00 2001
From: hev <wangrui@loongson.cn>
Date: Tue, 11 Nov 2025 15:24:38 +0800
Subject: [PATCH 73/97] Revert "[LoongArch] Add `isSafeToMove` hook to prevent
 unsafe instruction motion" (#167463)

Reverts llvm/llvm-project#163725
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   | 11 -----
 llvm/lib/CodeGen/BranchFolding.cpp            |  5 ---
 .../Target/LoongArch/LoongArchInstrInfo.cpp   | 45 ++++++++-----------
 .../lib/Target/LoongArch/LoongArchInstrInfo.h |  3 --
 4 files changed, 18 insertions(+), 46 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 18142c2c0adf3..43f28ed79f9dd 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1765,17 +1765,6 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
     return true;
   }
 
-  /// Return true if it's safe to move a machine instruction.
-  /// This allows the backend to prevent certain special instruction
-  /// sequences from being broken by instruction motion in optimization
-  /// passes.
-  /// By default, this returns true for every instruction.
-  virtual bool isSafeToMove(const MachineInstr &MI,
-                            const MachineBasicBlock *MBB,
-                            const MachineFunction &MF) const {
-    return true;
-  }
-
   /// Test if the given instruction should be considered a scheduling boundary.
   /// This primarily includes labels and terminators.
   virtual bool isSchedulingBoundary(const MachineInstr &MI,
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index af1625a209569..7292bc2be0df2 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -1979,7 +1979,6 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
   MachineBasicBlock::iterator FIB = FBB->begin();
   MachineBasicBlock::iterator TIE = TBB->end();
   MachineBasicBlock::iterator FIE = FBB->end();
-  MachineFunction &MF = *MBB->getParent();
   while (TIB != TIE && FIB != FIE) {
     // Skip dbg_value instructions. These do not count.
     TIB = skipDebugInstructionsForward(TIB, TIE, false);
@@ -1994,10 +1993,6 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
       // Hard to reason about register liveness with predicated instruction.
       break;
 
-    if (!TII->isSafeToMove(*TIB, MBB, MF))
-      // Don't hoist the instruction if it isn't safe to move.
-      break;
-
     bool IsSafe = true;
     for (MachineOperand &MO : TIB->operands()) {
       // Don't attempt to hoist instructions with register masks.
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 9fc862af7ea24..9a33dccd002c7 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -378,9 +378,12 @@ bool LoongArchInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
   }
 }
 
-bool LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI,
-                                      const MachineBasicBlock *MBB,
-                                      const MachineFunction &MF) const {
+bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                              const MachineBasicBlock *MBB,
+                                              const MachineFunction &MF) const {
+  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
+    return true;
+
   auto MII = MI.getIterator();
   auto MIE = MBB->end();
 
@@ -426,25 +429,25 @@ bool LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI,
     auto MO2 = Lu32I->getOperand(2).getTargetFlags();
     if (MO0 == LoongArchII::MO_PCREL_HI && MO1 == LoongArchII::MO_PCREL_LO &&
         MO2 == LoongArchII::MO_PCREL64_LO)
-      return false;
+      return true;
     if ((MO0 == LoongArchII::MO_GOT_PC_HI || MO0 == LoongArchII::MO_LD_PC_HI ||
          MO0 == LoongArchII::MO_GD_PC_HI) &&
         MO1 == LoongArchII::MO_GOT_PC_LO && MO2 == LoongArchII::MO_GOT_PC64_LO)
-      return false;
+      return true;
     if (MO0 == LoongArchII::MO_IE_PC_HI && MO1 == LoongArchII::MO_IE_PC_LO &&
         MO2 == LoongArchII::MO_IE_PC64_LO)
-      return false;
+      return true;
     if (MO0 == LoongArchII::MO_DESC_PC_HI &&
         MO1 == LoongArchII::MO_DESC_PC_LO &&
         MO2 == LoongArchII::MO_DESC64_PC_LO)
-      return false;
+      return true;
     break;
   }
   case LoongArch::LU52I_D: {
     auto MO = MI.getOperand(2).getTargetFlags();
     if (MO == LoongArchII::MO_PCREL64_HI || MO == LoongArchII::MO_GOT_PC64_HI ||
         MO == LoongArchII::MO_IE_PC64_HI || MO == LoongArchII::MO_DESC64_PC_HI)
-      return false;
+      return true;
     break;
   }
   default:
@@ -484,7 +487,7 @@ bool LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI,
         auto MO1 = LoongArchII::getDirectFlags(SecondOp->getOperand(2));
         auto MO2 = LoongArchII::getDirectFlags(Ld->getOperand(2));
         if (MO1 == LoongArchII::MO_DESC_PC_LO && MO2 == LoongArchII::MO_DESC_LD)
-          return false;
+          return true;
         break;
       }
       if (SecondOp == MIE ||
@@ -493,34 +496,34 @@ bool LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI,
       auto MO1 = LoongArchII::getDirectFlags(SecondOp->getOperand(2));
       if (MO0 == LoongArchII::MO_PCREL_HI && SecondOp->getOpcode() == AddiOp &&
           MO1 == LoongArchII::MO_PCREL_LO)
-        return false;
+        return true;
       if (MO0 == LoongArchII::MO_GOT_PC_HI && SecondOp->getOpcode() == LdOp &&
           MO1 == LoongArchII::MO_GOT_PC_LO)
-        return false;
+        return true;
       if ((MO0 == LoongArchII::MO_LD_PC_HI ||
            MO0 == LoongArchII::MO_GD_PC_HI) &&
           SecondOp->getOpcode() == AddiOp && MO1 == LoongArchII::MO_GOT_PC_LO)
-        return false;
+        return true;
       break;
     }
     case LoongArch::ADDI_W:
     case LoongArch::ADDI_D: {
       auto MO = LoongArchII::getDirectFlags(MI.getOperand(2));
       if (MO == LoongArchII::MO_PCREL_LO || MO == LoongArchII::MO_GOT_PC_LO)
-        return false;
+        return true;
       break;
     }
     case LoongArch::LD_W:
     case LoongArch::LD_D: {
       auto MO = LoongArchII::getDirectFlags(MI.getOperand(2));
       if (MO == LoongArchII::MO_GOT_PC_LO)
-        return false;
+        return true;
       break;
     }
     case LoongArch::PseudoDESC_CALL: {
       auto MO = LoongArchII::getDirectFlags(MI.getOperand(2));
       if (MO == LoongArchII::MO_DESC_CALL)
-        return false;
+        return true;
       break;
     }
     default:
@@ -528,18 +531,6 @@ bool LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI,
     }
   }
 
-  return true;
-}
-
-bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
-                                              const MachineBasicBlock *MBB,
-                                              const MachineFunction &MF) const {
-  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
-    return true;
-
-  if (!isSafeToMove(MI, MBB, MF))
-    return true;
-
   return false;
 }
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index 9f7a0a2239a87..796ef9f3a5715 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -66,9 +66,6 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo {
   bool isBranchOffsetInRange(unsigned BranchOpc,
                              int64_t BrOffset) const override;
 
-  bool isSafeToMove(const MachineInstr &MI, const MachineBasicBlock *MBB,
-                    const MachineFunction &MF) const override;
-
   bool isSchedulingBoundary(const MachineInstr &MI,
                             const MachineBasicBlock *MBB,
                             const MachineFunction &MF) const override;

From cc184e34ec82cba1c677ba0fe44bcae29926005a Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 10 Nov 2025 23:32:37 -0800
Subject: [PATCH 74/97] [Sparc] Use MCRegister instead of unsigned. NFC
 (#167464)

---
 .../Target/Sparc/AsmParser/SparcAsmParser.cpp | 48 ++++++++++---------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 7c1db3cfcd6b4..ef45d31a029d3 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -235,7 +235,7 @@ class SparcOperand : public MCParsedAsmOperand {
   };
 
   struct RegOp {
-    unsigned RegNum;
+    MCRegister Reg;
     RegisterKind Kind;
   };
 
@@ -244,8 +244,8 @@ class SparcOperand : public MCParsedAsmOperand {
   };
 
   struct MemOp {
-    unsigned Base;
-    unsigned OffsetReg;
+    MCRegister Base;
+    MCRegister OffsetReg;
     const MCExpr *Off;
   };
 
@@ -326,7 +326,7 @@ class SparcOperand : public MCParsedAsmOperand {
 
   MCRegister getReg() const override {
     assert((Kind == k_Register) && "Invalid access!");
-    return Reg.RegNum;
+    return Reg.Reg;
   }
 
   const MCExpr *getImm() const {
@@ -334,12 +334,12 @@ class SparcOperand : public MCParsedAsmOperand {
     return Imm.Val;
   }
 
-  unsigned getMemBase() const {
+  MCRegister getMemBase() const {
     assert((Kind == k_MemoryReg || Kind == k_MemoryImm) && "Invalid access!");
     return Mem.Base;
   }
 
-  unsigned getMemOffsetReg() const {
+  MCRegister getMemOffsetReg() const {
     assert((Kind == k_MemoryReg) && "Invalid access!");
     return Mem.OffsetReg;
   }
@@ -376,12 +376,16 @@ class SparcOperand : public MCParsedAsmOperand {
   void print(raw_ostream &OS, const MCAsmInfo &MAI) const override {
     switch (Kind) {
     case k_Token:     OS << "Token: " << getToken() << "\n"; break;
-    case k_Register:  OS << "Reg: #" << getReg() << "\n"; break;
+    case k_Register:
+      OS << "Reg: #" << getReg().id() << "\n";
+      break;
     case k_Immediate: OS << "Imm: " << getImm() << "\n"; break;
-    case k_MemoryReg: OS << "Mem: " << getMemBase() << "+"
-                         << getMemOffsetReg() << "\n"; break;
+    case k_MemoryReg:
+      OS << "Mem: " << getMemBase().id() << "+" << getMemOffsetReg().id()
+         << "\n";
+      break;
     case k_MemoryImm: assert(getMemOff() != nullptr);
-      OS << "Mem: " << getMemBase() << "+";
+      OS << "Mem: " << getMemBase().id() << "+";
       MAI.printExpr(OS, *getMemOff());
       OS << "\n";
       break;
@@ -432,7 +436,7 @@ class SparcOperand : public MCParsedAsmOperand {
 
     Inst.addOperand(MCOperand::createReg(getMemBase()));
 
-    assert(getMemOffsetReg() != 0 && "Invalid offset");
+    assert(getMemOffsetReg().isValid() && "Invalid offset");
     Inst.addOperand(MCOperand::createReg(getMemOffsetReg()));
   }
 
@@ -480,10 +484,10 @@ class SparcOperand : public MCParsedAsmOperand {
     return Op;
   }
 
-  static std::unique_ptr<SparcOperand> CreateReg(unsigned RegNum, unsigned Kind,
+  static std::unique_ptr<SparcOperand> CreateReg(MCRegister Reg, unsigned Kind,
                                                  SMLoc S, SMLoc E) {
     auto Op = std::make_unique<SparcOperand>(k_Register);
-    Op->Reg.RegNum = RegNum;
+    Op->Reg.Reg = Reg;
     Op->Reg.Kind   = (SparcOperand::RegisterKind)Kind;
     Op->StartLoc = S;
     Op->EndLoc = E;
@@ -540,7 +544,7 @@ class SparcOperand : public MCParsedAsmOperand {
       regIdx = Reg - Sparc::I0 + 24;
     if (regIdx % 2 || regIdx > 31)
       return false;
-    Op.Reg.RegNum = IntPairRegs[regIdx / 2];
+    Op.Reg.Reg = IntPairRegs[regIdx / 2];
     Op.Reg.Kind = rk_IntPairReg;
     return true;
   }
@@ -551,7 +555,7 @@ class SparcOperand : public MCParsedAsmOperand {
     unsigned regIdx = Reg - Sparc::F0;
     if (regIdx % 2 || regIdx > 31)
       return false;
-    Op.Reg.RegNum = DoubleRegs[regIdx / 2];
+    Op.Reg.Reg = DoubleRegs[regIdx / 2];
     Op.Reg.Kind = rk_DoubleReg;
     return true;
   }
@@ -574,7 +578,7 @@ class SparcOperand : public MCParsedAsmOperand {
       Reg = QuadFPRegs[regIdx / 2];
       break;
     }
-    Op.Reg.RegNum = Reg;
+    Op.Reg.Reg = Reg;
     Op.Reg.Kind = rk_QuadReg;
     return true;
   }
@@ -587,13 +591,13 @@ class SparcOperand : public MCParsedAsmOperand {
       regIdx = Reg - Sparc::C0;
     if (regIdx % 2 || regIdx > 31)
       return false;
-    Op.Reg.RegNum = CoprocPairRegs[regIdx / 2];
+    Op.Reg.Reg = CoprocPairRegs[regIdx / 2];
     Op.Reg.Kind = rk_CoprocPairReg;
     return true;
   }
 
   static std::unique_ptr<SparcOperand>
-  MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) {
+  MorphToMEMrr(MCRegister Base, std::unique_ptr<SparcOperand> Op) {
     MCRegister offsetReg = Op->getReg();
     Op->Kind = k_MemoryReg;
     Op->Mem.Base = Base;
@@ -602,8 +606,8 @@ class SparcOperand : public MCParsedAsmOperand {
     return Op;
   }
 
-  static std::unique_ptr<SparcOperand>
-  CreateMEMr(unsigned Base, SMLoc S, SMLoc E) {
+  static std::unique_ptr<SparcOperand> CreateMEMr(MCRegister Base, SMLoc S,
+                                                  SMLoc E) {
     auto Op = std::make_unique<SparcOperand>(k_MemoryReg);
     Op->Mem.Base = Base;
     Op->Mem.OffsetReg = Sparc::G0;  // always 0
@@ -614,11 +618,11 @@ class SparcOperand : public MCParsedAsmOperand {
   }
 
   static std::unique_ptr<SparcOperand>
-  MorphToMEMri(unsigned Base, std::unique_ptr<SparcOperand> Op) {
+  MorphToMEMri(MCRegister Base, std::unique_ptr<SparcOperand> Op) {
     const MCExpr *Imm  = Op->getImm();
     Op->Kind = k_MemoryImm;
     Op->Mem.Base = Base;
-    Op->Mem.OffsetReg = 0;
+    Op->Mem.OffsetReg = MCRegister();
     Op->Mem.Off = Imm;
     return Op;
   }

From 321afc76062482fb6a8852806f16d7c71a22e135 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 10 Nov 2025 23:39:31 -0800
Subject: [PATCH 75/97] [VE] Use MCRegister instead of unsigned. NFC (#167460)

---
 llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp | 76 ++++++++++----------
 1 file changed, 39 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
index 20f561a8dac34..9b47d237f0702 100644
--- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -54,7 +54,7 @@ class VEAsmParser : public MCTargetAsmParser {
                                uint64_t &ErrorInfo,
                                bool MatchingInlineAsm) override;
   bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override;
-  int parseRegisterName(MCRegister (*matchFn)(StringRef));
+  MCRegister parseRegisterName(MCRegister (*matchFn)(StringRef));
   ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
                                SMLoc &EndLoc) override;
   bool parseInstruction(ParseInstructionInfo &Info, StringRef Name,
@@ -169,7 +169,7 @@ class VEOperand : public MCParsedAsmOperand {
   };
 
   struct RegOp {
-    unsigned RegNum;
+    MCRegister Reg;
   };
 
   struct ImmOp {
@@ -177,8 +177,8 @@ class VEOperand : public MCParsedAsmOperand {
   };
 
   struct MemOp {
-    unsigned Base;
-    unsigned IndexReg;
+    MCRegister Base;
+    MCRegister IndexReg;
     const MCExpr *Index;
     const MCExpr *Offset;
   };
@@ -342,7 +342,7 @@ class VEOperand : public MCParsedAsmOperand {
 
   MCRegister getReg() const override {
     assert((Kind == k_Register) && "Invalid access!");
-    return Reg.RegNum;
+    return Reg.Reg;
   }
 
   const MCExpr *getImm() const {
@@ -350,14 +350,14 @@ class VEOperand : public MCParsedAsmOperand {
     return Imm.Val;
   }
 
-  unsigned getMemBase() const {
+  MCRegister getMemBase() const {
     assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryRegImmImm ||
             Kind == k_MemoryRegImm) &&
            "Invalid access!");
     return Mem.Base;
   }
 
-  unsigned getMemIndexReg() const {
+  MCRegister getMemIndexReg() const {
     assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryZeroRegImm) &&
            "Invalid access!");
     return Mem.IndexReg;
@@ -415,20 +415,21 @@ class VEOperand : public MCParsedAsmOperand {
       OS << "Token: " << getToken() << "\n";
       break;
     case k_Register:
-      OS << "Reg: #" << getReg() << "\n";
+      OS << "Reg: #" << getReg().id() << "\n";
       break;
     case k_Immediate:
       OS << "Imm: " << getImm() << "\n";
       break;
     case k_MemoryRegRegImm:
       assert(getMemOffset() != nullptr);
-      OS << "Mem: #" << getMemBase() << "+#" << getMemIndexReg() << "+";
+      OS << "Mem: #" << getMemBase().id() << "+#" << getMemIndexReg().id()
+         << "+";
       MAI.printExpr(OS, *getMemOffset());
       OS << "\n";
       break;
     case k_MemoryRegImmImm:
       assert(getMemIndex() != nullptr && getMemOffset() != nullptr);
-      OS << "Mem: #" << getMemBase() << "+";
+      OS << "Mem: #" << getMemBase().id() << "+";
       MAI.printExpr(OS, *getMemIndex());
       OS << "+";
       MAI.printExpr(OS, *getMemOffset());
@@ -436,7 +437,7 @@ class VEOperand : public MCParsedAsmOperand {
       break;
     case k_MemoryZeroRegImm:
       assert(getMemOffset() != nullptr);
-      OS << "Mem: 0+#" << getMemIndexReg() << "+";
+      OS << "Mem: 0+#" << getMemIndexReg().id() << "+";
       MAI.printExpr(OS, *getMemOffset());
       OS << "\n";
       break;
@@ -450,7 +451,7 @@ class VEOperand : public MCParsedAsmOperand {
       break;
     case k_MemoryRegImm:
       assert(getMemOffset() != nullptr);
-      OS << "Mem: #" << getMemBase() << "+";
+      OS << "Mem: #" << getMemBase().id() << "+";
       MAI.printExpr(OS, *getMemOffset());
       OS << "\n";
       break;
@@ -606,10 +607,10 @@ class VEOperand : public MCParsedAsmOperand {
     return Op;
   }
 
-  static std::unique_ptr<VEOperand> CreateReg(unsigned RegNum, SMLoc S,
+  static std::unique_ptr<VEOperand> CreateReg(MCRegister Reg, SMLoc S,
                                               SMLoc E) {
     auto Op = std::make_unique<VEOperand>(k_Register);
-    Op->Reg.RegNum = RegNum;
+    Op->Reg.Reg = Reg;
     Op->StartLoc = S;
     Op->EndLoc = E;
     return Op;
@@ -653,38 +654,38 @@ class VEOperand : public MCParsedAsmOperand {
   }
 
   static bool MorphToI32Reg(VEOperand &Op) {
-    unsigned Reg = Op.getReg();
+    MCRegister Reg = Op.getReg();
     unsigned regIdx = Reg - VE::SX0;
     if (regIdx > 63)
       return false;
-    Op.Reg.RegNum = I32Regs[regIdx];
+    Op.Reg.Reg = I32Regs[regIdx];
     return true;
   }
 
   static bool MorphToF32Reg(VEOperand &Op) {
-    unsigned Reg = Op.getReg();
+    MCRegister Reg = Op.getReg();
     unsigned regIdx = Reg - VE::SX0;
     if (regIdx > 63)
       return false;
-    Op.Reg.RegNum = F32Regs[regIdx];
+    Op.Reg.Reg = F32Regs[regIdx];
     return true;
   }
 
   static bool MorphToF128Reg(VEOperand &Op) {
-    unsigned Reg = Op.getReg();
+    MCRegister Reg = Op.getReg();
     unsigned regIdx = Reg - VE::SX0;
     if (regIdx % 2 || regIdx > 63)
       return false;
-    Op.Reg.RegNum = F128Regs[regIdx / 2];
+    Op.Reg.Reg = F128Regs[regIdx / 2];
     return true;
   }
 
   static bool MorphToVM512Reg(VEOperand &Op) {
-    unsigned Reg = Op.getReg();
+    MCRegister Reg = Op.getReg();
     unsigned regIdx = Reg - VE::VM0;
     if (regIdx % 2 || regIdx > 15)
       return false;
-    Op.Reg.RegNum = VM512Regs[regIdx / 2];
+    Op.Reg.Reg = VM512Regs[regIdx / 2];
     return true;
   }
 
@@ -696,16 +697,16 @@ class VEOperand : public MCParsedAsmOperand {
     if (regIdx > 31 || MISCRegs[regIdx] == VE::NoRegister)
       return false;
     Op.Kind = k_Register;
-    Op.Reg.RegNum = MISCRegs[regIdx];
+    Op.Reg.Reg = MISCRegs[regIdx];
     return true;
   }
 
   static std::unique_ptr<VEOperand>
-  MorphToMEMri(unsigned Base, std::unique_ptr<VEOperand> Op) {
+  MorphToMEMri(MCRegister Base, std::unique_ptr<VEOperand> Op) {
     const MCExpr *Imm = Op->getImm();
     Op->Kind = k_MemoryRegImm;
     Op->Mem.Base = Base;
-    Op->Mem.IndexReg = 0;
+    Op->Mem.IndexReg = MCRegister();
     Op->Mem.Index = nullptr;
     Op->Mem.Offset = Imm;
     return Op;
@@ -715,15 +716,16 @@ class VEOperand : public MCParsedAsmOperand {
   MorphToMEMzi(std::unique_ptr<VEOperand> Op) {
     const MCExpr *Imm = Op->getImm();
     Op->Kind = k_MemoryZeroImm;
-    Op->Mem.Base = 0;
-    Op->Mem.IndexReg = 0;
+    Op->Mem.Base = MCRegister();
+    Op->Mem.IndexReg = MCRegister();
     Op->Mem.Index = nullptr;
     Op->Mem.Offset = Imm;
     return Op;
   }
 
   static std::unique_ptr<VEOperand>
-  MorphToMEMrri(unsigned Base, unsigned Index, std::unique_ptr<VEOperand> Op) {
+  MorphToMEMrri(MCRegister Base, MCRegister Index,
+                std::unique_ptr<VEOperand> Op) {
     const MCExpr *Imm = Op->getImm();
     Op->Kind = k_MemoryRegRegImm;
     Op->Mem.Base = Base;
@@ -734,22 +736,22 @@ class VEOperand : public MCParsedAsmOperand {
   }
 
   static std::unique_ptr<VEOperand>
-  MorphToMEMrii(unsigned Base, const MCExpr *Index,
+  MorphToMEMrii(MCRegister Base, const MCExpr *Index,
                 std::unique_ptr<VEOperand> Op) {
     const MCExpr *Imm = Op->getImm();
     Op->Kind = k_MemoryRegImmImm;
     Op->Mem.Base = Base;
-    Op->Mem.IndexReg = 0;
+    Op->Mem.IndexReg = MCRegister();
     Op->Mem.Index = Index;
     Op->Mem.Offset = Imm;
     return Op;
   }
 
   static std::unique_ptr<VEOperand>
-  MorphToMEMzri(unsigned Index, std::unique_ptr<VEOperand> Op) {
+  MorphToMEMzri(MCRegister Index, std::unique_ptr<VEOperand> Op) {
     const MCExpr *Imm = Op->getImm();
     Op->Kind = k_MemoryZeroRegImm;
-    Op->Mem.Base = 0;
+    Op->Mem.Base = MCRegister();
     Op->Mem.IndexReg = Index;
     Op->Mem.Index = nullptr;
     Op->Mem.Offset = Imm;
@@ -760,8 +762,8 @@ class VEOperand : public MCParsedAsmOperand {
   MorphToMEMzii(const MCExpr *Index, std::unique_ptr<VEOperand> Op) {
     const MCExpr *Imm = Op->getImm();
     Op->Kind = k_MemoryZeroImmImm;
-    Op->Mem.Base = 0;
-    Op->Mem.IndexReg = 0;
+    Op->Mem.Base = MCRegister();
+    Op->Mem.IndexReg = MCRegister();
     Op->Mem.Index = Index;
     Op->Mem.Offset = Imm;
     return Op;
@@ -815,14 +817,14 @@ bool VEAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc,
 
 /// Parses a register name using a given matching function.
 /// Checks for lowercase or uppercase if necessary.
-int VEAsmParser::parseRegisterName(MCRegister (*matchFn)(StringRef)) {
+MCRegister VEAsmParser::parseRegisterName(MCRegister (*matchFn)(StringRef)) {
   StringRef Name = Parser.getTok().getString();
 
-  int RegNum = matchFn(Name);
+  MCRegister RegNum = matchFn(Name);
 
   // GCC supports case insensitive register names. All of the VE registers
   // are all lower case.
-  if (RegNum == VE::NoRegister) {
+  if (!RegNum) {
     RegNum = matchFn(Name.lower());
   }
 

From db8c38da44e20d1bce0b57bb548d248f470c623e Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen@gmail.com>
Date: Tue, 11 Nov 2025 03:24:02 -0500
Subject: [PATCH 76/97] [NVVM] Make nanosleep op duration SSA value (#167331)

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 6 ++----
 mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 8 --------
 mlir/test/Target/LLVMIR/nvvmir.mlir         | 6 +++---
 3 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 1cc5b74a3cb67..63ad73370a439 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -512,8 +512,7 @@ def NVVM_ReduxOp :
 //===----------------------------------------------------------------------===//
 
 def NVVM_NanosleepOp : NVVM_Op<"nanosleep">,
-  Arguments<(ins 
-  ConfinedAttr<I32Attr, [IntMinValue<1>, IntMaxValue<1000000>]>:$duration)> 
+  Arguments<(ins I32:$duration)> 
 {
   let summary = "Suspends the thread for a specified duration.";
 
@@ -531,8 +530,7 @@ def NVVM_NanosleepOp : NVVM_Op<"nanosleep">,
   
   string llvmBuilder = [{
       createIntrinsicCall(builder, 
-                          llvm::Intrinsic::nvvm_nanosleep, 
-                          {builder.getInt32($duration)});
+                          llvm::Intrinsic::nvvm_nanosleep, {$duration});
   }];
   let assemblyFormat = "attr-dict $duration";
 }
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
index 42aa2210eae1a..d5868ee73cc50 100644
--- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -578,14 +578,6 @@ llvm.func @ld_matrix(%arg0: !llvm.ptr<3>) {
 
 // -----
 
-llvm.func @nanosleep() {
-  // expected-error@+1 {{integer constant out of range for attribute}}
-  nvvm.nanosleep 100000000000000
-  llvm.return
-}
-
-// -----
-
 llvm.func @clusterlaunchcontrol_query_cancel_is_canceled_invalid_return_type(%try_cancel_response: i128) {
   // expected-error@+1 {{'nvvm.clusterlaunchcontrol.query.cancel' op is_canceled query type returns an i1}}
   %res = nvvm.clusterlaunchcontrol.query.cancel query = is_canceled, %try_cancel_response : i32
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 1ec55408e97a5..fec54cbf5e3e5 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -970,8 +970,8 @@ llvm.func @nvvm_pmevent() {
 // -----
 
 // CHECK-LABEL: @nanosleep
-llvm.func @nanosleep() {
-  // CHECK: call void @llvm.nvvm.nanosleep(i32 4000)
-  nvvm.nanosleep 4000
+llvm.func @nanosleep(%duration: i32) {
+  // CHECK: call void @llvm.nvvm.nanosleep(i32 %{{.*}})
+  nvvm.nanosleep %duration
   llvm.return
 }

From 531ce54e8ec975d9144a197fc2f755d2311b91f9 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Tue, 11 Nov 2025 01:00:46 -0800
Subject: [PATCH 77/97] [Hexagon] Fix a warning

This patch fixes:

  llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp:1799:32: error:
  private field 'HRI' is not used [-Werror,-Wunused-private-field]
---
 llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index b378ce47fd1bd..557a0a3f27819 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1796,7 +1796,7 @@ namespace {
 
     const MachineDominatorTree &MDT;
     const HexagonInstrInfo &HII;
-    const HexagonRegisterInfo &HRI;
+    [[maybe_unused]] const HexagonRegisterInfo &HRI;
     MachineRegisterInfo &MRI;
     BitTracker &BT;
   };

From dfdc69b4c27db5b18188ee038e081da90be49c54 Mon Sep 17 00:00:00 2001
From: Brandon Wu <songwu0813@gmail.com>
Date: Tue, 11 Nov 2025 17:01:45 +0800
Subject: [PATCH 78/97] [RISCV][llvm] Preliminary P extension codegen support
 (#162668)

This is the initial support of P extension codegen, it only includes
small part of instructions:
PADD_H, PADD_B,
PSADD_H, PSADD_B,
PAADD_H, PAADD_B,
PSADDU_H, PSADDU_B,
PAADDU_H, PAADDU_B,
PSUB_H, PSUB_B,
PDIF_H, PDIF_B,
PSSUB_H, PSSUB_B,
PASUB_H, PASUB_B,
PDIFU_H, PDIFU_B,
PSSUBU_H, PSSUBU_B,
PASUBU_H, PASUBU_B
---
 .../Target/RISCV/MCTargetDesc/RISCVMatInt.cpp |  29 +
 .../Target/RISCV/MCTargetDesc/RISCVMatInt.h   |   2 +-
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp   |  35 ++
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 218 ++++++++
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |   3 +
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |   4 +
 llvm/lib/Target/RISCV/RISCVInstrInfoP.td      |  95 +++-
 llvm/lib/Target/RISCV/RISCVRegisterInfo.td    |  10 +-
 llvm/lib/Target/RISCV/RISCVSubtarget.cpp      |  10 +
 llvm/lib/Target/RISCV/RISCVSubtarget.h        |   2 +
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  22 +
 llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll       | 515 ++++++++++++++++++
 llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll       | 514 +++++++++++++++++
 13 files changed, 1451 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll

diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
index 26f434b528584..cedaa8679ff1b 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
@@ -79,6 +79,32 @@ static void generateInstSeqImpl(int64_t Val, const MCSubtargetInfo &STI,
     }
   }
 
+  if (STI.hasFeature(RISCV::FeatureStdExtP)) {
+    // Check if the immediate is packed i8 or i10
+    int32_t Bit63To32 = Val >> 32;
+    int32_t Bit31To0 = Val;
+    int16_t Bit31To16 = Bit31To0 >> 16;
+    int16_t Bit15To0 = Bit31To0;
+    int8_t Bit15To8 = Bit15To0 >> 8;
+    int8_t Bit7To0 = Bit15To0;
+    if (Bit63To32 == Bit31To0) {
+      if (IsRV64 && isInt<10>(Bit63To32)) {
+        Res.emplace_back(RISCV::PLI_W, Bit63To32);
+        return;
+      }
+      if (Bit31To16 == Bit15To0) {
+        if (isInt<10>(Bit31To16)) {
+          Res.emplace_back(RISCV::PLI_H, Bit31To16);
+          return;
+        }
+        if (Bit15To8 == Bit7To0) {
+          Res.emplace_back(RISCV::PLI_B, Bit15To8);
+          return;
+        }
+      }
+    }
+  }
+
   if (isInt<32>(Val)) {
     // Depending on the active bits in the immediate Value v, the following
     // instruction sequences are emitted:
@@ -562,6 +588,9 @@ OpndKind Inst::getOpndKind() const {
   case RISCV::LUI:
   case RISCV::QC_LI:
   case RISCV::QC_E_LI:
+  case RISCV::PLI_B:
+  case RISCV::PLI_H:
+  case RISCV::PLI_W:
     return RISCVMatInt::Imm;
   case RISCV::ADD_UW:
     return RISCVMatInt::RegX0;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
index a82cd650f42fa..5df8edb2ee85a 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
@@ -21,7 +21,7 @@ namespace RISCVMatInt {
 
 enum OpndKind {
   RegImm, // ADDI/ADDIW/XORI/SLLI/SRLI/SLLI_UW/RORI/BSETI/BCLRI/TH_SRRI
-  Imm,    // LUI/QC_LI/QC_E_LI
+  Imm,    // LUI/QC_LI/QC_E_LI/PLI_B/PLI_H/PLI_W
   RegReg, // SH1ADD/SH2ADD/SH3ADD/PACK
   RegX0,  // ADD_UW
 };
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 907833513c5d1..1cbedb7d141e2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -991,6 +991,18 @@ static unsigned getSegInstNF(unsigned Intrinsic) {
   }
 }
 
+static bool isApplicableToPLI(int Val) {
+  // Check if the immediate is packed i8 or i10
+  int16_t Bit31To16 = Val >> 16;
+  int16_t Bit15To0 = Val;
+  int8_t Bit15To8 = Bit15To0 >> 8;
+  int8_t Bit7To0 = Val;
+  if (Bit31To16 != Bit15To0)
+    return false;
+
+  return isInt<10>(Bit31To16) || Bit15To8 == Bit7To0;
+}
+
 void RISCVDAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we have already selected.
   if (Node->isMachineOpcode()) {
@@ -1034,6 +1046,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     if (!isInt<32>(Imm) && isUInt<32>(Imm) && hasAllWUsers(Node))
       Imm = SignExtend64<32>(Imm);
 
+    if (Subtarget->enablePExtCodeGen() && isApplicableToPLI(Imm) &&
+        hasAllWUsers(Node)) {
+      // If it's 4 packed 8-bit integers or 2 packed signed 16-bit integers, we
+      // can simply copy lower 32 bits to higher 32 bits to make it able to
+      // rematerialize to PLI_B or PLI_H
+      Imm = ((uint64_t)Imm << 32) | (Imm & 0xFFFFFFFF);
+    }
+
     ReplaceNode(Node, selectImm(CurDAG, DL, VT, Imm, *Subtarget).getNode());
     return;
   }
@@ -2654,6 +2674,21 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
       CurDAG->RemoveDeadNode(Node);
       return;
     }
+    if (Subtarget->enablePExtCodeGen()) {
+      bool Is32BitCast =
+          (VT == MVT::i32 && (SrcVT == MVT::v4i8 || SrcVT == MVT::v2i16)) ||
+          (SrcVT == MVT::i32 && (VT == MVT::v4i8 || VT == MVT::v2i16));
+      bool Is64BitCast =
+          (VT == MVT::i64 && (SrcVT == MVT::v8i8 || SrcVT == MVT::v4i16 ||
+                              SrcVT == MVT::v2i32)) ||
+          (SrcVT == MVT::i64 &&
+           (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32));
+      if (Is32BitCast || Is64BitCast) {
+        ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
+        CurDAG->RemoveDeadNode(Node);
+        return;
+      }
+    }
     break;
   }
   case ISD::INSERT_SUBVECTOR:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4d86a365a5939..637f1943b8511 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -284,6 +284,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::riscv_nxv32i8x2, &RISCV::VRN2M4RegClass);
   }
 
+  // fixed vector is stored in GPRs for P extension packed operations
+  if (Subtarget.enablePExtCodeGen()) {
+    if (Subtarget.is64Bit()) {
+      addRegisterClass(MVT::v2i32, &RISCV::GPRRegClass);
+      addRegisterClass(MVT::v4i16, &RISCV::GPRRegClass);
+      addRegisterClass(MVT::v8i8, &RISCV::GPRRegClass);
+    } else {
+      addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass);
+      addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass);
+    }
+  }
+
   // Compute derived properties from the register classes.
   computeRegisterProperties(STI.getRegisterInfo());
 
@@ -492,6 +504,34 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       ISD::FTRUNC,       ISD::FRINT,         ISD::FROUND,
       ISD::FROUNDEVEN,   ISD::FCANONICALIZE};
 
+  if (Subtarget.enablePExtCodeGen()) {
+    setTargetDAGCombine(ISD::TRUNCATE);
+    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
+    SmallVector<MVT, 2> VTs;
+    if (Subtarget.is64Bit()) {
+      VTs.append({MVT::v2i32, MVT::v4i16, MVT::v8i8});
+      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
+      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+      setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
+      setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+      setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
+      setOperationAction(ISD::LOAD, MVT::v2i16, Custom);
+      setOperationAction(ISD::LOAD, MVT::v4i8, Custom);
+    } else {
+      VTs.append({MVT::v2i16, MVT::v4i8});
+    }
+    setOperationAction(ISD::UADDSAT, VTs, Legal);
+    setOperationAction(ISD::SADDSAT, VTs, Legal);
+    setOperationAction(ISD::USUBSAT, VTs, Legal);
+    setOperationAction(ISD::SSUBSAT, VTs, Legal);
+    setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, VTs, Legal);
+    setOperationAction({ISD::ABDS, ISD::ABDU}, VTs, Legal);
+    setOperationAction(ISD::BUILD_VECTOR, VTs, Custom);
+    setOperationAction(ISD::BITCAST, VTs, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VTs, Custom);
+  }
+
   if (Subtarget.hasStdExtZfbfmin()) {
     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
     setOperationAction(ISD::ConstantFP, MVT::bf16, Expand);
@@ -1776,6 +1816,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   MaxLoadsPerMemcmp = Subtarget.getMaxLoadsPerMemcmp(/*OptSize=*/false);
 }
 
+TargetLoweringBase::LegalizeTypeAction
+RISCVTargetLowering::getPreferredVectorAction(MVT VT) const {
+  if (Subtarget.is64Bit() && Subtarget.enablePExtCodeGen())
+    if (VT == MVT::v2i16 || VT == MVT::v4i8)
+      return TypeWidenVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
 EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL,
                                             LLVMContext &Context,
                                             EVT VT) const {
@@ -4391,6 +4440,37 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   MVT XLenVT = Subtarget.getXLenVT();
 
   SDLoc DL(Op);
+  // Handle P extension packed vector BUILD_VECTOR with PLI for splat constants
+  if (Subtarget.enablePExtCodeGen()) {
+    bool IsPExtVector =
+        (VT == MVT::v2i16 || VT == MVT::v4i8) ||
+        (Subtarget.is64Bit() &&
+         (VT == MVT::v4i16 || VT == MVT::v8i8 || VT == MVT::v2i32));
+    if (IsPExtVector) {
+      if (SDValue SplatValue = cast<BuildVectorSDNode>(Op)->getSplatValue()) {
+        if (auto *C = dyn_cast<ConstantSDNode>(SplatValue)) {
+          int64_t SplatImm = C->getSExtValue();
+          bool IsValidImm = false;
+
+          // Check immediate range based on vector type
+          if (VT == MVT::v8i8 || VT == MVT::v4i8) {
+            // PLI_B uses 8-bit unsigned or unsigned immediate
+            IsValidImm = isUInt<8>(SplatImm) || isInt<8>(SplatImm);
+            if (isUInt<8>(SplatImm))
+              SplatImm = (int8_t)SplatImm;
+          } else {
+            // PLI_H and PLI_W use 10-bit signed immediate
+            IsValidImm = isInt<10>(SplatImm);
+          }
+
+          if (IsValidImm) {
+            SDValue Imm = DAG.getSignedTargetConstant(SplatImm, DL, XLenVT);
+            return DAG.getNode(RISCVISD::PLI, DL, VT, Imm);
+          }
+        }
+      }
+    }
+  }
 
   // Proper support for f16 requires Zvfh. bf16 always requires special
   // handling. We need to cast the scalar to integer and create an integer
@@ -7546,6 +7626,19 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
       return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
     }
 
+    if (Subtarget.enablePExtCodeGen()) {
+      bool Is32BitCast =
+          (VT == MVT::i32 && (Op0VT == MVT::v4i8 || Op0VT == MVT::v2i16)) ||
+          (Op0VT == MVT::i32 && (VT == MVT::v4i8 || VT == MVT::v2i16));
+      bool Is64BitCast =
+          (VT == MVT::i64 && (Op0VT == MVT::v8i8 || Op0VT == MVT::v4i16 ||
+                              Op0VT == MVT::v2i32)) ||
+          (Op0VT == MVT::i64 &&
+           (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32));
+      if (Is32BitCast || Is64BitCast)
+        return Op;
+    }
+
     // Consider other scalar<->scalar casts as legal if the types are legal.
     // Otherwise expand them.
     if (!VT.isVector() && !Op0VT.isVector()) {
@@ -8218,6 +8311,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     auto *Store = cast<StoreSDNode>(Op);
     SDValue StoredVal = Store->getValue();
     EVT VT = StoredVal.getValueType();
+    if (Subtarget.enablePExtCodeGen()) {
+      if (VT == MVT::v2i16 || VT == MVT::v4i8) {
+        SDValue DL(Op);
+        SDValue Cast = DAG.getBitcast(MVT::i32, StoredVal);
+        SDValue NewStore =
+            DAG.getStore(Store->getChain(), DL, Cast, Store->getBasePtr(),
+                         Store->getPointerInfo(), Store->getBaseAlign(),
+                         Store->getMemOperand()->getFlags());
+        return NewStore;
+      }
+    }
     if (VT == MVT::f64) {
       assert(Subtarget.hasStdExtZdinx() && !Subtarget.hasStdExtZilsd() &&
              !Subtarget.is64Bit() && "Unexpected custom legalisation");
@@ -10500,6 +10604,17 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
     return DAG.getNode(RISCVISD::FMV_H_X, DL, EltVT, IntExtract);
   }
 
+  if (Subtarget.enablePExtCodeGen() && VecVT.isFixedLengthVector()) {
+    if (VecVT != MVT::v4i16 && VecVT != MVT::v2i16 && VecVT != MVT::v8i8 &&
+        VecVT != MVT::v4i8 && VecVT != MVT::v2i32)
+      return SDValue();
+    SDValue Extracted = DAG.getBitcast(XLenVT, Vec);
+    unsigned ElemWidth = EltVT.getSizeInBits();
+    SDValue Shamt = DAG.getNode(ISD::MUL, DL, XLenVT, Idx,
+                                DAG.getConstant(ElemWidth, DL, XLenVT));
+    return DAG.getNode(ISD::SRL, DL, XLenVT, Extracted, Shamt);
+  }
+
   // If this is a fixed vector, we need to convert it to a scalable vector.
   MVT ContainerVT = VecVT;
   if (VecVT.isFixedLengthVector()) {
@@ -14642,6 +14757,21 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
+    if (Subtarget.is64Bit() && Subtarget.enablePExtCodeGen()) {
+      SDLoc DL(N);
+      SDValue ExtLoad =
+          DAG.getExtLoad(ISD::SEXTLOAD, DL, MVT::i64, Ld->getChain(),
+                         Ld->getBasePtr(), MVT::i32, Ld->getMemOperand());
+      if (N->getValueType(0) == MVT::v2i16) {
+        Results.push_back(DAG.getBitcast(MVT::v4i16, ExtLoad));
+        Results.push_back(ExtLoad.getValue(1));
+      } else if (N->getValueType(0) == MVT::v4i8) {
+        Results.push_back(DAG.getBitcast(MVT::v8i8, ExtLoad));
+        Results.push_back(ExtLoad.getValue(1));
+      }
+      return;
+    }
+
     assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
            "Unexpected custom legalisation");
 
@@ -14997,6 +15127,21 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
     Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, NewRes));
     break;
   }
+  case RISCVISD::PASUB:
+  case RISCVISD::PASUBU: {
+    MVT VT = N->getSimpleValueType(0);
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+    assert(VT == MVT::v2i16 || VT == MVT::v4i8);
+    MVT NewVT = MVT::v4i16;
+    if (VT == MVT::v4i8)
+      NewVT = MVT::v8i8;
+    SDValue Undef = DAG.getUNDEF(VT);
+    Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, NewVT, {Op0, Undef});
+    Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, NewVT, {Op1, Undef});
+    Results.push_back(DAG.getNode(N->getOpcode(), DL, NewVT, {Op0, Op1}));
+    return;
+  }
   case ISD::EXTRACT_VECTOR_ELT: {
     // Custom-legalize an EXTRACT_VECTOR_ELT where XLEN<SEW, as the SEW element
     // type is illegal (currently only vXi64 RV32).
@@ -16104,11 +16249,84 @@ static SDValue combineTruncSelectToSMaxUSat(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::TRUNCATE, DL, VT, Min);
 }
 
+// Handle P extension averaging subtraction pattern:
+// (vXiY (trunc (srl (sub ([s|z]ext vXiY:$a), ([s|z]ext vXiY:$b)), 1)))
+// -> PASUB/PASUBU
+static SDValue combinePExtTruncate(SDNode *N, SelectionDAG &DAG,
+                                   const RISCVSubtarget &Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  if (N0.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  MVT VecVT = VT.getSimpleVT();
+  if (VecVT != MVT::v4i16 && VecVT != MVT::v2i16 && VecVT != MVT::v8i8 &&
+      VecVT != MVT::v4i8 && VecVT != MVT::v2i32)
+    return SDValue();
+
+  // Check if shift amount is 1
+  SDValue ShAmt = N0.getOperand(1);
+  if (ShAmt.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(ShAmt.getNode());
+  if (!BV)
+    return SDValue();
+  SDValue Splat = BV->getSplatValue();
+  if (!Splat)
+    return SDValue();
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat);
+  if (!C)
+    return SDValue();
+  if (C->getZExtValue() != 1)
+    return SDValue();
+
+  // Check for SUB operation
+  SDValue Sub = N0.getOperand(0);
+  if (Sub.getOpcode() != ISD::SUB)
+    return SDValue();
+
+  SDValue LHS = Sub.getOperand(0);
+  SDValue RHS = Sub.getOperand(1);
+
+  // Check if both operands are sign/zero extends from the target
+  // type
+  bool IsSignExt = LHS.getOpcode() == ISD::SIGN_EXTEND &&
+                   RHS.getOpcode() == ISD::SIGN_EXTEND;
+  bool IsZeroExt = LHS.getOpcode() == ISD::ZERO_EXTEND &&
+                   RHS.getOpcode() == ISD::ZERO_EXTEND;
+
+  if (!IsSignExt && !IsZeroExt)
+    return SDValue();
+
+  SDValue A = LHS.getOperand(0);
+  SDValue B = RHS.getOperand(0);
+
+  // Check if the extends are from our target vector type
+  if (A.getValueType() != VT || B.getValueType() != VT)
+    return SDValue();
+
+  // Determine the instruction based on type and signedness
+  unsigned Opc;
+  if (IsSignExt)
+    Opc = RISCVISD::PASUB;
+  else if (IsZeroExt)
+    Opc = RISCVISD::PASUBU;
+  else
+    return SDValue();
+
+  // Create the machine node directly
+  return DAG.getNode(Opc, SDLoc(N), VT, {A, B});
+}
+
 static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
                                       const RISCVSubtarget &Subtarget) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
+  if (VT.isFixedLengthVector() && Subtarget.enablePExtCodeGen())
+    return combinePExtTruncate(N, DAG, Subtarget);
+
   // Pre-promote (i1 (truncate (srl X, Y))) on RV64 with Zbs without zero
   // extending X. This is safe since we only need the LSB after the shift and
   // shift amounts larger than 31 would produce poison. If we wait until
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index dd62a9cf6c9e2..5cc427c867cfd 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -71,6 +71,9 @@ class RISCVTargetLowering : public TargetLowering {
 
   bool preferScalarizeSplat(SDNode *N) const override;
 
+  /// Customize the preferred legalization strategy for certain types.
+  LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
+
   bool softPromoteHalfType() const override { return true; }
 
   /// Return the register type for a given MVT, ensuring vectors are treated
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index e0cdd1186c5c1..9d5421241bf0d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2915,6 +2915,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
         case RISCVOp::OPERAND_UIMM9_LSB000:
           Ok = isShiftedUInt<6, 3>(Imm);
           break;
+        case RISCVOp::OPERAND_SIMM8_UNSIGNED:
+          Ok = isInt<8>(Imm);
+          break;
         case RISCVOp::OPERAND_SIMM10_LSB0000_NONZERO:
           Ok = isShiftedInt<6, 4>(Imm) && (Imm != 0);
           break;
@@ -2936,6 +2939,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
           // clang-format off
         CASE_OPERAND_SIMM(5)
         CASE_OPERAND_SIMM(6)
+        CASE_OPERAND_SIMM(10)
         CASE_OPERAND_SIMM(11)
         CASE_OPERAND_SIMM(12)
         CASE_OPERAND_SIMM(26)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 4cbbba3aa68cb..7637047aabf2d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -18,7 +18,7 @@
 // Operand and SDNode transformation definitions.
 //===----------------------------------------------------------------------===//
 
-def simm10 : RISCVSImmOp<10>;
+def simm10 : RISCVSImmOp<10>, TImmLeaf<XLenVT, "return isInt<10>(Imm);">;
 
 def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> {
   let RenderMethod = "addSImm8UnsignedOperands";
@@ -26,7 +26,7 @@ def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> {
 
 // A 8-bit signed immediate allowing range [-128, 255]
 // but represented as [-128, 127].
-def simm8_unsigned : RISCVOp {
+def simm8_unsigned : RISCVOp, TImmLeaf<XLenVT, "return isInt<8>(Imm);"> {
   let ParserMatchClass = SImm8UnsignedAsmOperand;
   let EncoderMethod = "getImmOpValue";
   let DecoderMethod = "decodeSImmOperand<8>";
@@ -1463,8 +1463,91 @@ let Predicates = [HasStdExtP, IsRV32] in {
 
 def riscv_absw : RVSDNode<"ABSW", SDTIntUnaryOp>;
 
-let Predicates = [HasStdExtP] in
-def : PatGpr<abs, ABS>;
+def SDT_RISCVPLI : SDTypeProfile<1, 1, [SDTCisVec<0>,
+                                        SDTCisInt<0>,
+                                        SDTCisInt<1>]>;
+def riscv_pli : RVSDNode<"PLI", SDT_RISCVPLI>;
+def SDT_RISCVPASUB : SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                          SDTCisInt<0>,
+                                          SDTCisSameAs<0, 1>,
+                                          SDTCisSameAs<0, 2>]>;
+def riscv_pasub : RVSDNode<"PASUB", SDT_RISCVPASUB>;
+def riscv_pasubu : RVSDNode<"PASUBU", SDT_RISCVPASUB>;
 
-let Predicates = [HasStdExtP, IsRV64] in
-def : PatGpr<riscv_absw, ABSW>;
+let Predicates = [HasStdExtP] in {
+  def : PatGpr<abs, ABS>;
+
+  // Basic 8-bit arithmetic patterns
+  def: Pat<(XLenVecI8VT (add GPR:$rs1, GPR:$rs2)), (PADD_B GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI8VT (sub GPR:$rs1, GPR:$rs2)), (PSUB_B GPR:$rs1, GPR:$rs2)>;
+
+  // Basic 16-bit arithmetic patterns
+  def: Pat<(XLenVecI16VT (add GPR:$rs1, GPR:$rs2)), (PADD_H GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI16VT (sub GPR:$rs1, GPR:$rs2)), (PSUB_H GPR:$rs1, GPR:$rs2)>;
+
+  // 8-bit saturating add/sub patterns
+  def: Pat<(XLenVecI8VT (saddsat GPR:$rs1, GPR:$rs2)), (PSADD_B GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI8VT (uaddsat GPR:$rs1, GPR:$rs2)), (PSADDU_B GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI8VT (ssubsat GPR:$rs1, GPR:$rs2)), (PSSUB_B GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI8VT (usubsat GPR:$rs1, GPR:$rs2)), (PSSUBU_B GPR:$rs1, GPR:$rs2)>;
+
+  // 16-bit saturating add/sub patterns
+  def: Pat<(XLenVecI16VT (saddsat GPR:$rs1, GPR:$rs2)), (PSADD_H GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI16VT (uaddsat GPR:$rs1, GPR:$rs2)), (PSADDU_H GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI16VT (ssubsat GPR:$rs1, GPR:$rs2)), (PSSUB_H GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI16VT (usubsat GPR:$rs1, GPR:$rs2)), (PSSUBU_H GPR:$rs1, GPR:$rs2)>;
+
+  // 8-bit averaging patterns
+  def: Pat<(XLenVecI8VT (avgfloors GPR:$rs1, GPR:$rs2)), (PAADD_B GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI8VT (avgflooru GPR:$rs1, GPR:$rs2)), (PAADDU_B GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI8VT (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_B GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI8VT (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_B GPR:$rs1, GPR:$rs2)>;
+
+  // 16-bit averaging patterns
+  def: Pat<(XLenVecI16VT (avgfloors GPR:$rs1, GPR:$rs2)), (PAADD_H GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI16VT (avgflooru GPR:$rs1, GPR:$rs2)), (PAADDU_H GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI16VT (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_H GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI16VT (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_H GPR:$rs1, GPR:$rs2)>;
+  
+  // 8-bit absolute difference patterns
+  def: Pat<(XLenVecI8VT (abds GPR:$rs1, GPR:$rs2)), (PDIF_B GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI8VT (abdu GPR:$rs1, GPR:$rs2)), (PDIFU_B GPR:$rs1, GPR:$rs2)>;
+  
+  // 16-bit absolute difference patterns
+  def: Pat<(XLenVecI16VT (abds GPR:$rs1, GPR:$rs2)), (PDIF_H GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(XLenVecI16VT (abdu GPR:$rs1, GPR:$rs2)), (PDIFU_H GPR:$rs1, GPR:$rs2)>;
+  
+
+  // 8-bit PLI SD node pattern
+  def: Pat<(XLenVecI8VT (riscv_pli simm8_unsigned:$imm8)), (PLI_B simm8_unsigned:$imm8)>;
+  // 16-bit PLI SD node pattern
+  def: Pat<(XLenVecI16VT (riscv_pli simm10:$imm10)), (PLI_H simm10:$imm10)>;
+
+} // Predicates = [HasStdExtP]
+
+let Predicates = [HasStdExtP, IsRV32] in {
+  // Load/Store patterns
+  def : StPat<store, SW, GPR, v4i8>;
+  def : StPat<store, SW, GPR, v2i16>;
+  def : LdPat<load, LW, v4i8>;
+  def : LdPat<load, LW, v2i16>;
+} // Predicates = [HasStdExtP, IsRV32]
+
+let Predicates = [HasStdExtP, IsRV64] in {
+  def : PatGpr<riscv_absw, ABSW>;
+
+  // 32-bit PLI SD node pattern
+  def: Pat<(v2i32 (riscv_pli simm10:$imm10)), (PLI_W simm10:$imm10)>;
+
+  // 32-bit averaging-sub patterns
+  def: Pat<(v2i32 (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_W GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v2i32 (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_W GPR:$rs1, GPR:$rs2)>;
+
+  // Load/Store patterns
+  def : StPat<store, SD, GPR, v8i8>;
+  def : StPat<store, SD, GPR, v4i16>;
+  def : StPat<store, SD, GPR, v2i32>;
+  def : LdPat<load, LD, v8i8>;
+  def : LdPat<load, LD, v4i16>;
+  def : LdPat<load, LD, v2i32>;
+} // Predicates = [HasStdExtP, IsRV64]
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 6605a5ccdfde2..87095e75d5dc4 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -222,6 +222,12 @@ def XLenFVT : ValueTypeByHwMode<[RV64],
                                 [f64]>;
 def XLenPairFVT : ValueTypeByHwMode<[RV32],
                                     [f64]>;
+
+// P extension
+def XLenVecI8VT : ValueTypeByHwMode<[RV32, RV64],
+                                    [v4i8, v8i8]>;
+def XLenVecI16VT : ValueTypeByHwMode<[RV32,  RV64],
+                                     [v2i16, v4i16]>;
 def XLenRI : RegInfoByHwMode<
       [RV32,              RV64],
       [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
@@ -238,7 +244,9 @@ class RISCVRegisterClass<list<ValueType> regTypes, int align, dag regList>
 }
 
 class GPRRegisterClass<dag regList>
-    : RISCVRegisterClass<[XLenVT, XLenFVT], 32, regList> {
+    : RISCVRegisterClass<[XLenVT, XLenFVT,
+                          // P extension packed vector types:
+                          XLenVecI8VT, XLenVecI16VT, v2i32], 32, regList> {
   let RegInfos = XLenRI;
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index 3b43be3d15743..926cc9ea547a6 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -69,6 +69,12 @@ static cl::opt<bool> UseMIPSCCMovInsn("use-riscv-mips-ccmov",
                                       cl::desc("Use 'mips.ccmov' instruction"),
                                       cl::init(true), cl::Hidden);
 
+static cl::opt<bool> EnablePExtCodeGen(
+    "enable-p-ext-codegen",
+    cl::desc("Turn on P Extension codegen(This is a temporary switch where "
+             "only partial codegen is currently supported)"),
+    cl::init(false), cl::Hidden);
+
 void RISCVSubtarget::anchor() {}
 
 RISCVSubtarget &
@@ -145,6 +151,10 @@ bool RISCVSubtarget::useConstantPoolForLargeInts() const {
   return !RISCVDisableUsingConstantPoolForLargeInts;
 }
 
+bool RISCVSubtarget::enablePExtCodeGen() const {
+  return HasStdExtP && EnablePExtCodeGen;
+}
+
 unsigned RISCVSubtarget::getMaxBuildIntsCost() const {
   // Loading integer from constant pool needs two instructions (the reason why
   // the minimum cost is 2): an address calculation instruction and a load
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 4026364c90e6c..f05115dbeb8cb 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -322,6 +322,8 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
     }
   }
 
+  bool enablePExtCodeGen() const;
+
   // Returns VLEN divided by DLEN. Where DLEN is the datapath width of the
   // vector hardware implementation which may be less than VLEN.
   unsigned getDLenFactor() const {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 3d8eb4097604a..dca6e9cffebb0 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -969,6 +969,13 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead(
   if (isa<ScalableVectorType>(Ty))
     return InstructionCost::getInvalid();
 
+  // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
+  // For now, skip all fixed vector cost analysis when P extension is available
+  // to avoid crashes in getMinRVVVectorSizeInBits()
+  if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Ty)) {
+    return 1; // Treat as single instruction cost for now
+  }
+
   // A build_vector (which is m1 sized or smaller) can be done in no
   // worse than one vslide1down.vx per element in the type.  We could
   // in theory do an explode_vector in the inverse manner, but our
@@ -1625,6 +1632,14 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   if (!IsVectorType)
     return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
 
+  // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
+  // For now, skip all fixed vector cost analysis when P extension is available
+  // to avoid crashes in getMinRVVVectorSizeInBits()
+  if (ST->enablePExtCodeGen() &&
+      (isa<FixedVectorType>(Dst) || isa<FixedVectorType>(Src))) {
+    return 1; // Treat as single instruction cost for now
+  }
+
   // FIXME: Need to compute legalizing cost for illegal types.  The current
   // code handles only legal types and those which can be trivially
   // promoted to legal.
@@ -2323,6 +2338,13 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                  const Value *Op1) const {
   assert(Val->isVectorTy() && "This must be a vector type");
 
+  // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
+  // For now, skip all fixed vector cost analysis when P extension is available
+  // to avoid crashes in getMinRVVVectorSizeInBits()
+  if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Val)) {
+    return 1; // Treat as single instruction cost for now
+  }
+
   if (Opcode != Instruction::ExtractElement &&
       Opcode != Instruction::InsertElement)
     return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
new file mode 100644
index 0000000000000..46d5e9f9a538f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV64 %s
+
+; Test basic add/sub operations for v2i16
+define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_padd_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    padd.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = add <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psub_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    psub.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = sub <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test basic add/sub operations for v4i8
+define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_padd_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    padd.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = add <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psub_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    psub.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = sub <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating add operations for v2i16
+define void @test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psadd_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    psadd.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psaddu_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    psaddu.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating sub operations for v2i16
+define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssub_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pssub.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pssubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssubu_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pssubu.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating add operations for v4i8
+define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psadd_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    psadd.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psaddu_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    psaddu.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating sub operations for v4i8
+define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssub_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pssub.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssubu_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pssubu.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor signed operations for v2i16
+define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paadd_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    paadd.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %ext.a = sext <2 x i16> %a to <2 x i32>
+  %ext.b = sext <2 x i16> %b to <2 x i32>
+  %add = add nsw <2 x i32> %ext.a, %ext.b
+  %shift = ashr <2 x i32> %add, <i32 1, i32 1>
+  %res = trunc <2 x i32> %shift to <2 x i16>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v2i16
+define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    paaddu.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %and = and <2 x i16> %a, %b
+  %xor = xor <2 x i16> %a, %b
+  %shift = lshr <2 x i16> %xor, <i16 1, i16 1>
+  %res = add <2 x i16> %and, %shift
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor signed operations for v4i8
+define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paadd_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    paadd.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %ext.a = sext <4 x i8> %a to <4 x i16>
+  %ext.b = sext <4 x i8> %b to <4 x i16>
+  %add = add nsw <4 x i16> %ext.a, %ext.b
+  %shift = ashr <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <4 x i16> %shift to <4 x i8>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v4i8
+define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    paaddu.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %and = and <4 x i8> %a, %b
+  %xor = xor <4 x i8> %a, %b
+  %shift = lshr <4 x i8> %xor, <i8 1, i8 1, i8 1, i8 1>
+  %res = add <4 x i8> %and, %shift
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference signed for v2i16
+define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pdif.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %res = sub <2 x i16> %max, %min
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v2i16
+define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pdifu.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %min = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %res = sub <2 x i16> %max, %min
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference signed for v4i8
+define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pdif.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %min = call <4 x i8> @llvm.smin.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %max = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %res = sub <4 x i8> %max, %min
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v4i8
+define void @test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pdifu.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %min = call <4 x i8> @llvm.umin.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %max = call <4 x i8> @llvm.umax.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %res = sub <4 x i8> %max, %min
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v2i16
+; pasub pattern: (a - b) arithmetic shift right 1
+define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pasub.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %a_ext = sext <2 x i16> %a to <2 x i32>
+  %b_ext = sext <2 x i16> %b to <2 x i32>
+  %sub = sub <2 x i32> %a_ext, %b_ext
+  %res = ashr <2 x i32> %sub, <i32 1, i32 1>
+  %res_trunc = trunc <2 x i32> %res to <2 x i16>
+  store <2 x i16> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v2i16
+; pasubu pattern: (a - b) logical shift right 1
+define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pasubu.h a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %a_ext = zext <2 x i16> %a to <2 x i32>
+  %b_ext = zext <2 x i16> %b to <2 x i32>
+  %sub = sub <2 x i32> %a_ext, %b_ext
+  %res = lshr <2 x i32> %sub, <i32 1, i32 1>
+  %res_trunc = trunc <2 x i32> %res to <2 x i16>
+  store <2 x i16> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v4i8
+define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pasub.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %a_ext = sext <4 x i8> %a to <4 x i16>
+  %b_ext = sext <4 x i8> %b to <4 x i16>
+  %sub = sub <4 x i16> %a_ext, %b_ext
+  %res = ashr <4 x i16> %sub, <i16 1, i16 1, i16 1, i16 1>
+  %res_trunc = trunc <4 x i16> %res to <4 x i8>
+  store <4 x i8> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v4i8
+define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    lw a2, 0(a2)
+; CHECK-NEXT:    pasubu.b a1, a1, a2
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %a_ext = zext <4 x i8> %a to <4 x i16>
+  %b_ext = zext <4 x i8> %b to <4 x i16>
+  %sub = sub <4 x i16> %a_ext, %b_ext
+  %res = lshr <4 x i16> %sub, <i16 1, i16 1, i16 1, i16 1>
+  %res_trunc = trunc <4 x i16> %res to <4 x i8>
+  store <4 x i8> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test PLI (pack load immediate) for v2i16
+define void @test_pli_h(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pli.h a1, 42
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %res = add <2 x i16> <i16 42, i16 42>, <i16 0, i16 0>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pli_h_negative(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_h_negative:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pli.h a1, -5
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %res = add <2 x i16> <i16 -5, i16 -5>, <i16 0, i16 0>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test PLI for v4i8 with unsigned immediate
+define void @test_pli_b(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pli.b a1, 32
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %res = add <4 x i8> <i8 32, i8 32, i8 32, i8 32>, <i8 0, i8 0, i8 0, i8 0>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pli_b_negative(ptr %ret_ptr) {
+; CHECK-RV32-LABEL: test_pli_b_negative:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    pli.b a1, -2
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_pli_b_negative:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    pli.h a1, -258
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+  %res = add <4 x i8> <i8 -2, i8 -2, i8 -2, i8 -2>, <i8 0, i8 0, i8 0, i8 0>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_extract_vector_16(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-LABEL: test_extract_vector_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    sh a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %extracted = extractelement <2 x i16> %a, i32 0
+  store i16 %extracted, ptr %ret_ptr
+  ret void
+}
+
+define void @test_extract_vector_8(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-LABEL: test_extract_vector_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a1, 0(a1)
+; CHECK-NEXT:    sb a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %extracted = extractelement <4 x i8> %a, i32 0
+  store i8 %extracted, ptr %ret_ptr
+  ret void
+}
+
+; Intrinsic declarations
+declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>)
+declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>)
+declare <4 x i8> @llvm.smin.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.smax.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.umin.v4i8(<4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.umax.v4i8(<4 x i8>, <4 x i8>)
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
new file mode 100644
index 0000000000000..000a95fb6e0f8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
@@ -0,0 +1,514 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck %s
+
+; Test basic add/sub operations for v4i16
+define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_padd_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    padd.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %res = add <4 x i16> %a, %b
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psub_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    psub.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %res = sub <4 x i16> %a, %b
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test basic add/sub operations for v8i8
+define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_padd_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    padd.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %res = add <8 x i8> %a, %b
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psub_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    psub.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %res = sub <8 x i8> %a, %b
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating add operations for v4i16
+define void @test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psadd_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    psadd.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %res = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psaddu_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    psaddu.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %res = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating sub operations for v4i16
+define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssub_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pssub.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %res = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pssubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssubu_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pssubu.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %res = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating add operations for v8i8
+define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psadd_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    psadd.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %res = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psaddu_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    psaddu.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %res = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating sub operations for v8i8
+define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssub_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pssub.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %res = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pssubu_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pssubu.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %res = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor signed operations for v4i16
+; avgfloors pattern: (a + b) arithmetic shift right 1
+define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paadd_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    paadd.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %ext.a = sext <4 x i16> %a to <4 x i32>
+  %ext.b = sext <4 x i16> %b to <4 x i32>
+  %add = add nsw <4 x i32> %ext.a, %ext.b
+  %shift = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <4 x i32> %shift to <4 x i16>
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v4i16
+; avgflooru pattern: (a & b) + ((a ^ b) >> 1)
+define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    paaddu.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %and = and <4 x i16> %a, %b
+  %xor = xor <4 x i16> %a, %b
+  %shift = lshr <4 x i16> %xor, <i16 1, i16 1, i16 1, i16 1>
+  %res = add <4 x i16> %and, %shift
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor signed operations for v8i8
+define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paadd_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    paadd.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %ext.a = sext <8 x i8> %a to <8 x i16>
+  %ext.b = sext <8 x i8> %b to <8 x i16>
+  %add = add nsw <8 x i16> %ext.a, %ext.b
+  %shift = ashr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <8 x i16> %shift to <8 x i8>
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v8i8
+define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    paaddu.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %and = and <8 x i8> %a, %b
+  %xor = xor <8 x i8> %a, %b
+  %shift = lshr <8 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <8 x i8> %and, %shift
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference signed for v4i16
+; abds pattern: sub(smax(a,b), smin(a,b))
+define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pdif.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %min = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %max = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %res = sub <4 x i16> %max, %min
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v4i16
+; abdu pattern: sub(umax(a,b), umin(a,b))
+define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pdifu.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %min = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %max = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %res = sub <4 x i16> %max, %min
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference signed for v8i8
+define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pdif.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %min = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %max = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %res = sub <8 x i8> %max, %min
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v8i8
+define void @test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pdifu.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %min = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %max = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %res = sub <8 x i8> %max, %min
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v4i16
+; pasub pattern: (a - b) arithmetic shift right 1
+define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pasub.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %a_ext = sext <4 x i16> %a to <4 x i32>
+  %b_ext = sext <4 x i16> %b to <4 x i32>
+  %sub = sub <4 x i32> %a_ext, %b_ext
+  %res = ashr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1>
+  %res_trunc = trunc <4 x i32> %res to <4 x i16>
+  store <4 x i16> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v4i16
+; pasubu pattern: (a - b) logical shift right 1
+define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pasubu.h a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %a_ext = zext <4 x i16> %a to <4 x i32>
+  %b_ext = zext <4 x i16> %b to <4 x i32>
+  %sub = sub <4 x i32> %a_ext, %b_ext
+  %res = lshr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1>
+  %res_trunc = trunc <4 x i32> %res to <4 x i16>
+  store <4 x i16> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v8i8
+define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pasub.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %a_ext = sext <8 x i8> %a to <8 x i16>
+  %b_ext = sext <8 x i8> %b to <8 x i16>
+  %sub = sub <8 x i16> %a_ext, %b_ext
+  %res = ashr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res_trunc = trunc <8 x i16> %res to <8 x i8>
+  store <8 x i8> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v8i8
+define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    ld a2, 0(a2)
+; CHECK-NEXT:    pasubu.b a1, a1, a2
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %a_ext = zext <8 x i8> %a to <8 x i16>
+  %b_ext = zext <8 x i8> %b to <8 x i16>
+  %sub = sub <8 x i16> %a_ext, %b_ext
+  %res = lshr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res_trunc = trunc <8 x i16> %res to <8 x i8>
+  store <8 x i8> %res_trunc, ptr %ret_ptr
+  ret void
+}
+
+; Test PLI (pack load immediate) for v4i16
+define void @test_pli_h(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pli.h a1, 100
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %res = add <4 x i16> <i16 100, i16 100, i16 100, i16 100>, <i16 0, i16 0, i16 0, i16 0>
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test PLI for v8i8 with unsigned immediate
+define void @test_pli_b(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pli.b a1, 64
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %res = add <8 x i8> <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test PLI for v2i32 with signed immediate
+define void @test_pli_w(ptr %ret_ptr) {
+; CHECK-LABEL: test_pli_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pli.w a1, -256
+; CHECK-NEXT:    sd a1, 0(a0)
+; CHECK-NEXT:    ret
+  %res = add <2 x i32> <i32 -256, i32 -256>, <i32 0, i32 0>
+  store <2 x i32> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_extract_vector_16(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-LABEL: test_extract_vector_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    sh a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %extracted = extractelement <4 x i16> %a, i32 0
+  store i16 %extracted, ptr %ret_ptr
+  ret void
+}
+
+define void @test_extract_vector_8(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-LABEL: test_extract_vector_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    sb a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %extracted = extractelement <8 x i8> %a, i32 0
+  store i8 %extracted, ptr %ret_ptr
+  ret void
+}
+
+define void @test_extract_vector_32(ptr %ret_ptr, ptr %a_ptr) {
+; CHECK-LABEL: test_extract_vector_32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld a1, 0(a1)
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x i32>, ptr %a_ptr
+  %extracted = extractelement <2 x i32> %a, i32 0
+  store i32 %extracted, ptr %ret_ptr
+  ret void
+}
+
+; Intrinsic declarations
+declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i8> @llvm.smin.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.smax.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.umin.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.umax.v8i8(<8 x i8>, <8 x i8>)

From 658b5e35a7dbb05826f2db3ec636a0d7fcbccfee Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Tue, 11 Nov 2025 09:29:55 +0000
Subject: [PATCH 79/97] [llvm][DebugInfo] Adjust ModuleDebugInfoPrinter to
 versioned language names (#167293)

We shouldn't be calling `getUnversionedName` unconditionally because
DWARFv6-enabled Clang will emit versioned language names that are
versioned (i.e., `sourceLanguageName`...see new test case). We need to
check `hasVersionedName` and pick the appropriate API based on that.
Otherwise we assert in `getUnversionedName`.
---
 llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp  | 18 ++++++++++-----
 ...ebuginfofinder-cu-source-language-names.ll | 22 +++++++++++++++++++
 2 files changed, 34 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll

diff --git a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
index f31d625eca14c..9d53c37461ba8 100644
--- a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
+++ b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
@@ -43,13 +43,19 @@ static void printModuleDebugInfo(raw_ostream &O, const Module *M,
   // filenames), so just print a few useful things.
   for (DICompileUnit *CU : Finder.compile_units()) {
     O << "Compile unit: ";
-    auto Lang =
-        dwarf::LanguageString(CU->getSourceLanguage().getUnversionedName());
-    if (!Lang.empty())
-      O << Lang;
+
+    DISourceLanguageName Lang = CU->getSourceLanguage();
+    auto LangStr =
+        Lang.hasVersionedName()
+            ? dwarf::SourceLanguageNameString(
+                  static_cast<llvm::dwarf::SourceLanguageName>(Lang.getName()))
+            : dwarf::LanguageString(Lang.getName());
+
+    if (!LangStr.empty())
+      O << LangStr;
     else
-      O << "unknown-language(" << CU->getSourceLanguage().getUnversionedName()
-        << ")";
+      O << "unknown-language(" << CU->getSourceLanguage().getName() << ")";
+
     printFile(O, CU->getFilename(), CU->getDirectory());
     O << '\n';
   }
diff --git a/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll b/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll
new file mode 100644
index 0000000000000..aafeb5ceb0db3
--- /dev/null
+++ b/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll
@@ -0,0 +1,22 @@
+; RUN: opt -passes='print<module-debuginfo>' -disable-output 2>&1 < %s \
+; RUN:   | FileCheck %s
+
+; CHECK: Compile unit: DW_LANG_C99 from /tmp/test1.c
+; CHECK: Compile unit: DW_LNAME_C from /tmp/test2.c
+; CHECK: Compile unit: unknown-language(0) from /tmp/test3.c
+
+!llvm.dbg.cu = !{!0, !6, !10}
+!llvm.module.flags = !{!8, !9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "test1.c", directory: "/tmp")
+!2 = !{}
+!3 = !DIFile(filename: "test1.c", directory: "/tmp")
+!4 = !DISubroutineType(types: !7)
+!5 = !{null}
+!6 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_C, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !7, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!7 = !DIFile(filename: "test2.c", directory: "/tmp")
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 3}
+!10 = distinct !DICompileUnit(sourceLanguageName: 0, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !11, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!11 = !DIFile(filename: "test3.c", directory: "/tmp")

From 9100001cd08ccb9a4091200c2421ff2aee7829c6 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me@antoniofrighetto.com>
Date: Tue, 11 Nov 2025 10:28:12 +0100
Subject: [PATCH 80/97] [IndVarSimplify] Precommit tests for PR166649 (NFC)

---
 .../IndVarSimplify/floating-point-iv.ll       | 138 ++++++++++++++++++
 1 file changed, 138 insertions(+)

diff --git a/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll b/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll
index b1ef50382c070..3ffd446bc3e98 100644
--- a/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll
+++ b/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll
@@ -417,3 +417,141 @@ loop:
 exit:
   ret void
 }
+
+; FIXME: These are miscompilation issues.
+define void @test_fp_to_int_irrealizable_initval() {
+; CHECK-LABEL: @test_fp_to_int_irrealizable_initval(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV_INT:%.*]] = phi i32 [ 100000000, [[ENTRY:%.*]] ], [ [[IV_NEXT_INT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    call void @opaque()
+; CHECK-NEXT:    [[IV_NEXT_INT]] = add nsw i32 [[IV_INT]], -17
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[IV_NEXT_INT]], 25
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi float [ 1.000000e+08, %entry ], [ %iv.next, %loop ]
+  call void @opaque()
+  %iv.next = fadd float %iv, -1.700000e+01
+  %cmp = fcmp ult float %iv.next, 2.500000e+01
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_fp_to_int_irrealizable_exitval() {
+; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV_INT:%.*]] = phi i32 [ 25, [[ENTRY:%.*]] ], [ [[IV_NEXT_INT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    call void @opaque()
+; CHECK-NEXT:    [[IV_NEXT_INT]] = add nuw nsw i32 [[IV_INT]], 17
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[IV_NEXT_INT]], 100000000
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi float [ 2.500000e+01, %entry ], [ %iv.next, %loop ]
+  call void @opaque()
+  %iv.next = fadd float %iv, 1.700000e+01
+  %cmp = fcmp ugt float %iv.next, 1.000000e+08
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_fp_to_int_irrealizable_negative_exitval() {
+; CHECK-LABEL: @test_fp_to_int_irrealizable_negative_exitval(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV_INT:%.*]] = phi i32 [ -25, [[ENTRY:%.*]] ], [ [[IV_NEXT_INT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    call void @opaque()
+; CHECK-NEXT:    [[IV_NEXT_INT]] = add nsw i32 [[IV_INT]], -17
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV_NEXT_INT]], -100000000
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi float [ -2.500000e+01, %entry ], [ %iv.next, %loop ]
+  call void @opaque()
+  %iv.next = fadd float %iv, -1.700000e+01
+  %cmp = fcmp ult float %iv.next, -1.000000e+08
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_fp_to_int_irrealizable_exitval_pow_2_24() {
+; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval_pow_2_24(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV_INT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT_INT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    call void @opaque()
+; CHECK-NEXT:    [[IV_NEXT_INT]] = add nuw nsw i32 [[IV_INT]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[IV_NEXT_INT]], 16777216
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi float [ 0.000000e+00, %entry ], [ %iv.next, %loop ]
+  call void @opaque()
+  %iv.next = fadd float %iv, 1.000000e+00
+  %cmp = fcmp ugt float %iv.next, 0x4170000000000000
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_fp_to_int_irrealizable_exitval_int64_min() {
+; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval_int64_min(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi double [ 2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    call void @opaque()
+; CHECK-NEXT:    [[IV_NEXT]] = fadd double [[IV]], 1.700000e+01
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult double [[IV_NEXT]], 0xC3E0000000000000
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi double [ 2.500000e+01, %entry ], [ %iv.next, %loop ]
+  call void @opaque()
+  %iv.next = fadd double %iv, 1.700000e+01
+  %cmp = fcmp ult double %iv.next, 0xC3E0000000000000
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare void @opaque()

From eaf3a91722fbb0f6598907c9dfa14ca632cc71c4 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me@antoniofrighetto.com>
Date: Tue, 11 Nov 2025 10:28:35 +0100
Subject: [PATCH 81/97] [IndVarSimplify] Ensure fp values can be represented as
 exact integers

When transforming floating-point induction variables into integer ones,
make sure we stay within the bounds of fp values that can be represented
as integers without gaps, i.e., 2^24 and 2^53 for IEEE-754 single and
double precision respectively (both on negative and positive side).

Fixes: https://github.com/llvm/llvm-project/issues/166496.
---
 llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 18 +++++++++++--
 .../IndVarSimplify/floating-point-iv.ll       | 25 +++++++++----------
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 4ba4ba3850e58..eab1d4975ac96 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -196,6 +196,18 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
   return true;
 }
 
+// Ensure we stay within the bounds of fp values that can be represented as
+// integers without gaps, which are 2^24 and 2^53 for IEEE-754 single and double
+// precision respectively (both on negative and positive side).
+static bool isRepresentableAsExactInteger(ConstantFP *FPVal, int64_t IntVal) {
+  const auto &InitValueFltSema = FPVal->getValueAPF().getSemantics();
+  if (!APFloat::isIEEELikeFP(InitValueFltSema))
+    return false;
+
+  return isUIntN(APFloat::semanticsPrecision(InitValueFltSema),
+                 AbsoluteValue(IntVal));
+}
+
 /// If the loop has floating induction variable then insert corresponding
 /// integer induction variable if possible.
 /// For example,
@@ -212,7 +224,8 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
   auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));
 
   int64_t InitValue;
-  if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
+  if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue) ||
+      !isRepresentableAsExactInteger(InitValueVal, InitValue))
     return false;
 
   // Check IV increment. Reject this PN if increment operation is not
@@ -262,7 +275,8 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
   ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
   int64_t ExitValue;
   if (ExitValueVal == nullptr ||
-      !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
+      !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue) ||
+      !isRepresentableAsExactInteger(ExitValueVal, ExitValue))
     return false;
 
   // Find new predicate for integer comparison.
diff --git a/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll b/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll
index 3ffd446bc3e98..c4933678d0391 100644
--- a/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll
+++ b/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll
@@ -418,16 +418,15 @@ exit:
   ret void
 }
 
-; FIXME: These are miscompilation issues.
 define void @test_fp_to_int_irrealizable_initval() {
 ; CHECK-LABEL: @test_fp_to_int_irrealizable_initval(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV_INT:%.*]] = phi i32 [ 100000000, [[ENTRY:%.*]] ], [ [[IV_NEXT_INT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi float [ 1.000000e+08, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    call void @opaque()
-; CHECK-NEXT:    [[IV_NEXT_INT]] = add nsw i32 [[IV_INT]], -17
-; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ult i32 [[IV_NEXT_INT]], 25
+; CHECK-NEXT:    [[IV_NEXT]] = fadd float [[IV]], -1.700000e+01
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult float [[IV_NEXT]], 2.500000e+01
 ; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
@@ -451,10 +450,10 @@ define void @test_fp_to_int_irrealizable_exitval() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV_INT:%.*]] = phi i32 [ 25, [[ENTRY:%.*]] ], [ [[IV_NEXT_INT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi float [ 2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    call void @opaque()
-; CHECK-NEXT:    [[IV_NEXT_INT]] = add nuw nsw i32 [[IV_INT]], 17
-; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[IV_NEXT_INT]], 100000000
+; CHECK-NEXT:    [[IV_NEXT]] = fadd float [[IV]], 1.700000e+01
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[IV_NEXT]], 1.000000e+08
 ; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
@@ -478,10 +477,10 @@ define void @test_fp_to_int_irrealizable_negative_exitval() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV_INT:%.*]] = phi i32 [ -25, [[ENTRY:%.*]] ], [ [[IV_NEXT_INT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi float [ -2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    call void @opaque()
-; CHECK-NEXT:    [[IV_NEXT_INT]] = add nsw i32 [[IV_INT]], -17
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV_NEXT_INT]], -100000000
+; CHECK-NEXT:    [[IV_NEXT]] = fadd float [[IV]], -1.700000e+01
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ult float [[IV_NEXT]], -1.000000e+08
 ; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
@@ -505,10 +504,10 @@ define void @test_fp_to_int_irrealizable_exitval_pow_2_24() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV_INT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT_INT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    call void @opaque()
-; CHECK-NEXT:    [[IV_NEXT_INT]] = add nuw nsw i32 [[IV_INT]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp samesign ugt i32 [[IV_NEXT_INT]], 16777216
+; CHECK-NEXT:    [[IV_NEXT]] = fadd float [[IV]], 1.000000e+00
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ugt float [[IV_NEXT]], 0x4170000000000000
 ; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void

From 8e3188a47f33374790c7141a2a7256c1c955284a Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Tue, 11 Nov 2025 10:36:00 +0100
Subject: [PATCH 82/97] [clang][bytecode] Mark CXXDefaultInitExprs in InitLink
 chain (#166395)

So we know before _what_ entry in the chain we need to look for the
InitList.

Fixes https://github.com/llvm/llvm-project/issues/166171
---
 clang/lib/AST/ByteCode/Compiler.cpp | 11 ++++++-----
 clang/lib/AST/ByteCode/Compiler.h   | 17 +++++++++++++----
 clang/test/AST/ByteCode/cxx14.cpp   | 21 +++++++++++++++++++++
 3 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index f68422c6eb01d..1243380ca8a6b 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -5432,8 +5432,7 @@ bool Compiler<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
   unsigned EndIndex = 0;
   // Find the init list.
   for (StartIndex = InitStack.size() - 1; StartIndex > 0; --StartIndex) {
-    if (InitStack[StartIndex].Kind == InitLink::K_InitList ||
-        InitStack[StartIndex].Kind == InitLink::K_This) {
+    if (InitStack[StartIndex].Kind == InitLink::K_DIE) {
       EndIndex = StartIndex;
       --StartIndex;
       break;
@@ -5446,7 +5445,8 @@ bool Compiler<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
       continue;
 
     if (InitStack[StartIndex].Kind != InitLink::K_Field &&
-        InitStack[StartIndex].Kind != InitLink::K_Elem)
+        InitStack[StartIndex].Kind != InitLink::K_Elem &&
+        InitStack[StartIndex].Kind != InitLink::K_DIE)
       break;
   }
 
@@ -5457,7 +5457,8 @@ bool Compiler<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
 
   // Emit the instructions.
   for (unsigned I = StartIndex; I != (EndIndex + 1); ++I) {
-    if (InitStack[I].Kind == InitLink::K_InitList)
+    if (InitStack[I].Kind == InitLink::K_InitList ||
+        InitStack[I].Kind == InitLink::K_DIE)
       continue;
     if (!InitStack[I].template emit<Emitter>(this, E))
       return false;
@@ -6328,8 +6329,8 @@ bool Compiler<Emitter>::compileConstructor(const CXXConstructorDecl *Ctor) {
 
       unsigned FirstLinkOffset =
           R->getField(cast<FieldDecl>(IFD->chain()[0]))->Offset;
-      InitStackScope<Emitter> ISS(this, isa<CXXDefaultInitExpr>(InitExpr));
       InitLinkScope<Emitter> ILS(this, InitLink::Field(FirstLinkOffset));
+      InitStackScope<Emitter> ISS(this, isa<CXXDefaultInitExpr>(InitExpr));
       if (!emitFieldInitializer(NestedField, NestedFieldOffset, InitExpr,
                                 IsUnion))
         return false;
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index 5c46f75af4da3..0c6cab9276531 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -52,12 +52,14 @@ struct InitLink {
     K_Decl = 3,
     K_Elem = 5,
     K_RVO = 6,
-    K_InitList = 7
+    K_InitList = 7,
+    K_DIE = 8,
   };
 
   static InitLink This() { return InitLink{K_This}; }
   static InitLink InitList() { return InitLink{K_InitList}; }
   static InitLink RVO() { return InitLink{K_RVO}; }
+  static InitLink DIE() { return InitLink{K_DIE}; }
   static InitLink Field(unsigned Offset) {
     InitLink IL{K_Field};
     IL.Offset = Offset;
@@ -668,22 +670,29 @@ template <class Emitter> class InitLinkScope final {
 
   ~InitLinkScope() { this->Ctx->InitStack.pop_back(); }
 
-private:
+public:
   Compiler<Emitter> *Ctx;
 };
 
 template <class Emitter> class InitStackScope final {
 public:
   InitStackScope(Compiler<Emitter> *Ctx, bool Active)
-      : Ctx(Ctx), OldValue(Ctx->InitStackActive) {
+      : Ctx(Ctx), OldValue(Ctx->InitStackActive), Active(Active) {
     Ctx->InitStackActive = Active;
+    if (Active)
+      Ctx->InitStack.push_back(InitLink::DIE());
   }
 
-  ~InitStackScope() { this->Ctx->InitStackActive = OldValue; }
+  ~InitStackScope() {
+    this->Ctx->InitStackActive = OldValue;
+    if (Active)
+      Ctx->InitStack.pop_back();
+  }
 
 private:
   Compiler<Emitter> *Ctx;
   bool OldValue;
+  bool Active;
 };
 
 } // namespace interp
diff --git a/clang/test/AST/ByteCode/cxx14.cpp b/clang/test/AST/ByteCode/cxx14.cpp
index 9622311e100cb..57cb42ea4a98b 100644
--- a/clang/test/AST/ByteCode/cxx14.cpp
+++ b/clang/test/AST/ByteCode/cxx14.cpp
@@ -7,3 +7,24 @@ constexpr int(*null_ptr)() = nullptr;
 constexpr int test4 = (*null_ptr)(); // both-error {{must be initialized by a constant expression}} \
                                      // both-note {{evaluates to a null function pointer}}
 
+struct E {
+  int n = 0;
+  struct {
+    void *x = this;
+  };
+  void *y = this;
+};
+constexpr E e1 = E();
+static_assert(e1.x != e1.y, "");
+constexpr E e2 = E{0};
+static_assert(e2.x != e2.y, "");
+
+struct S {
+  int &&a = 2;
+  int b[1]{a};
+};
+constexpr int foo() {
+  S s{12};
+  return s.b[0];
+}
+static_assert(foo() == 12, "");

From 517d7254637df127e6aaf1fe982a8e5d574f9617 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Tue, 11 Nov 2025 10:42:54 +0100
Subject: [PATCH 83/97] [LV] Move condition to
 VPPartialReductionRecipe::execute (#166136)

This means that VPExpressions will now be constructed for
VPPartialReductionRecipe's when the loop has tail-folding predication.

Note that control-flow (if/else) predication is not yet handled
for partial reductions, because of the way partial reductions
are recognised and built up.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   9 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  38 +-
 .../partial-reduce-dot-product-epilogue.ll    | 242 ++----
 .../partial-reduce-dot-product-neon.ll        | 726 ++++++------------
 .../AArch64/partial-reduce-dot-product.ll     |   2 +-
 .../LoopVectorize/AArch64/vplan-printing.ll   |  71 ++
 6 files changed, 424 insertions(+), 664 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 566d6eafee63e..19823263a23bf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8226,15 +8226,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction,
   }
 
   VPValue *Cond = nullptr;
-  if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent())) {
-    assert((ReductionOpcode == Instruction::Add ||
-            ReductionOpcode == Instruction::Sub) &&
-           "Expected an ADD or SUB operation for predicated partial "
-           "reductions (because the neutral element in the mask is zero)!");
+  if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent()))
     Cond = getBlockInMask(Builder.getInsertBlock());
-    VPValue *Zero = Plan.getConstantInt(ReductionI->getType(), 0);
-    BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
-  }
   return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
                                       ScaleFactor, ReductionI);
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 707886f873fba..9a5591feb3d05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -311,18 +311,27 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   std::optional<unsigned> Opcode;
   VPValue *Op = getVecOp();
   uint64_t MulConst;
+
+  InstructionCost CondCost = 0;
+  if (isConditional()) {
+    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+    auto *VecTy = Ctx.Types.inferScalarType(Op);
+    auto *CondTy = Ctx.Types.inferScalarType(getCondOp());
+    CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy,
+                                          Pred, Ctx.CostKind);
+  }
+
   // If the partial reduction is predicated, a select will be operand 1.
   // If it isn't predicated and the mul isn't operating on a constant, then it
   // should have been turned into a VPExpressionRecipe.
   // FIXME: Replace the entire function with this once all partial reduction
   // variants are bundled into VPExpressionRecipe.
-  if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) &&
-      !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
+  if (!match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) {
     auto *PhiType = Ctx.Types.inferScalarType(getChainOp());
     auto *InputType = Ctx.Types.inferScalarType(getVecOp());
-    return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType,
-                                           PhiType, VF, TTI::PR_None,
-                                           TTI::PR_None, {}, Ctx.CostKind);
+    return CondCost + Ctx.TTI.getPartialReductionCost(
+                          getOpcode(), InputType, InputType, PhiType, VF,
+                          TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind);
   }
 
   VPRecipeBase *OpR = Op->getDefiningRecipe();
@@ -381,12 +390,13 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) {
     HandleWiden(Widen);
   } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) {
-    return Reduction->computeCost(VF, Ctx);
+    return CondCost + Reduction->computeCost(VF, Ctx);
   }
   auto *PhiType = Ctx.Types.inferScalarType(getOperand(1));
-  return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB,
-                                         PhiType, VF, ExtAType, ExtBType,
-                                         Opcode, Ctx.CostKind);
+  return CondCost + Ctx.TTI.getPartialReductionCost(
+                        getOpcode(), InputTypeA, InputTypeB, PhiType, VF,
+                        ExtAType, ExtBType, Opcode, Ctx.CostKind);
+  ;
 }
 
 void VPPartialReductionRecipe::execute(VPTransformState &State) {
@@ -395,12 +405,18 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
   assert(getOpcode() == Instruction::Add &&
          "Unhandled partial reduction opcode");
 
-  Value *BinOpVal = State.get(getOperand(1));
-  Value *PhiVal = State.get(getOperand(0));
+  Value *BinOpVal = State.get(getVecOp());
+  Value *PhiVal = State.get(getChainOp());
   assert(PhiVal && BinOpVal && "Phi and Mul must be set");
 
   Type *RetTy = PhiVal->getType();
 
+  if (isConditional()) {
+    Value *Cond = State.get(getCondOp());
+    Value *Zero = ConstantInt::get(BinOpVal->getType(), 0);
+    BinOpVal = Builder.CreateSelect(Cond, BinOpVal, Zero);
+  }
+
   CallInst *V =
       Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add,
                               {PhiVal, BinOpVal}, nullptr, "partial.reduce");
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index d8f1a86c9ebda..5b9bd0997f2fa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -182,313 +182,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
-; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
-; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
-; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
-; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; CHECK:       pred.load.continue:
 ; CHECK-NEXT:    [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
 ; CHECK:       pred.load.if1:
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
 ; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
 ; CHECK:       pred.load.continue2:
 ; CHECK-NEXT:    [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
 ; CHECK:       pred.load.if3:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
 ; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-NEXT:    [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1
+; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
 ; CHECK:       pred.load.continue4:
 ; CHECK-NEXT:    [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT:    [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
 ; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-NEXT:    [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
 ; CHECK:       pred.load.continue6:
 ; CHECK-NEXT:    [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
 ; CHECK-NEXT:    br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
 ; CHECK:       pred.load.if7:
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
 ; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
 ; CHECK:       pred.load.continue8:
 ; CHECK-NEXT:    [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-NEXT:    [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
 ; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
 ; CHECK:       pred.load.if9:
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-NEXT:    [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1
+; CHECK-NEXT:    [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
 ; CHECK:       pred.load.continue10:
 ; CHECK-NEXT:    [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-NEXT:    [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ]
 ; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
 ; CHECK-NEXT:    br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
 ; CHECK:       pred.load.if11:
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-NEXT:    [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-NEXT:    [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
 ; CHECK:       pred.load.continue12:
 ; CHECK-NEXT:    [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-NEXT:    [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ]
 ; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
 ; CHECK-NEXT:    br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
 ; CHECK:       pred.load.if13:
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
 ; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-NEXT:    [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
 ; CHECK:       pred.load.continue14:
 ; CHECK-NEXT:    [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-NEXT:    [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ]
 ; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
 ; CHECK-NEXT:    br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
 ; CHECK:       pred.load.if15:
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
 ; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-NEXT:    [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-NEXT:    [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
 ; CHECK:       pred.load.continue16:
 ; CHECK-NEXT:    [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-NEXT:    [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
 ; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
 ; CHECK-NEXT:    br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
 ; CHECK:       pred.load.if17:
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
 ; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
 ; CHECK-NEXT:    [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-NEXT:    [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
 ; CHECK:       pred.load.continue18:
 ; CHECK-NEXT:    [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-NEXT:    [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
 ; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
 ; CHECK-NEXT:    br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
 ; CHECK:       pred.load.if19:
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
 ; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
 ; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-NEXT:    [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-NEXT:    [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
 ; CHECK:       pred.load.continue20:
 ; CHECK-NEXT:    [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-NEXT:    [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
 ; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
 ; CHECK-NEXT:    br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
 ; CHECK:       pred.load.if21:
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
 ; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
 ; CHECK-NEXT:    [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-NEXT:    [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-NEXT:    [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE22]]
 ; CHECK:       pred.load.continue22:
 ; CHECK-NEXT:    [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-NEXT:    [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
 ; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
 ; CHECK-NEXT:    br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
 ; CHECK:       pred.load.if23:
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
 ; CHECK-NEXT:    [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
 ; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-NEXT:    [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE24]]
 ; CHECK:       pred.load.continue24:
 ; CHECK-NEXT:    [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-NEXT:    [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
 ; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
 ; CHECK-NEXT:    br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
 ; CHECK:       pred.load.if25:
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
 ; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-NEXT:    [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-NEXT:    [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE26]]
 ; CHECK:       pred.load.continue26:
 ; CHECK-NEXT:    [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-NEXT:    [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
 ; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
 ; CHECK-NEXT:    br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
 ; CHECK:       pred.load.if27:
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
 ; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-NEXT:    [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-NEXT:    [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE28]]
 ; CHECK:       pred.load.continue28:
 ; CHECK-NEXT:    [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-NEXT:    [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
 ; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
 ; CHECK:       pred.load.if29:
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
 ; CHECK-NEXT:    [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
 ; CHECK-NEXT:    [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE30]]
-; CHECK:       pred.load.continue30:
-; CHECK-NEXT:    [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-NEXT:    [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
-; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
-; CHECK-NEXT:    br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK:       pred.load.if31:
-; CHECK-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
-; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE32]]
-; CHECK:       pred.load.continue32:
-; CHECK-NEXT:    [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
-; CHECK-NEXT:    [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
-; CHECK-NEXT:    br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK:       pred.load.if33:
-; CHECK-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
-; CHECK-NEXT:    [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE34]]
-; CHECK:       pred.load.continue34:
-; CHECK-NEXT:    [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
-; CHECK-NEXT:    [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
-; CHECK-NEXT:    br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
-; CHECK:       pred.load.if35:
-; CHECK-NEXT:    [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
-; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE36]]
-; CHECK:       pred.load.continue36:
-; CHECK-NEXT:    [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
-; CHECK-NEXT:    [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
-; CHECK-NEXT:    br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
-; CHECK:       pred.load.if37:
-; CHECK-NEXT:    [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
-; CHECK-NEXT:    [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE38]]
-; CHECK:       pred.load.continue38:
-; CHECK-NEXT:    [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
-; CHECK-NEXT:    [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
-; CHECK-NEXT:    br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
-; CHECK:       pred.load.if39:
-; CHECK-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
-; CHECK-NEXT:    [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE40]]
-; CHECK:       pred.load.continue40:
-; CHECK-NEXT:    [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
-; CHECK-NEXT:    [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
-; CHECK-NEXT:    br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
-; CHECK:       pred.load.if41:
-; CHECK-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
-; CHECK-NEXT:    [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE42]]
-; CHECK:       pred.load.continue42:
-; CHECK-NEXT:    [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
-; CHECK-NEXT:    [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
-; CHECK-NEXT:    br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
-; CHECK:       pred.load.if43:
-; CHECK-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
-; CHECK-NEXT:    [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE44]]
-; CHECK:       pred.load.continue44:
-; CHECK-NEXT:    [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
-; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
-; CHECK-NEXT:    br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
-; CHECK:       pred.load.if45:
-; CHECK-NEXT:    [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
-; CHECK-NEXT:    [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE46]]
-; CHECK:       pred.load.continue46:
-; CHECK-NEXT:    [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
-; CHECK-NEXT:    [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
-; CHECK-NEXT:    br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
-; CHECK:       pred.load.if47:
-; CHECK-NEXT:    [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
-; CHECK-NEXT:    [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE48]]
-; CHECK:       pred.load.continue48:
-; CHECK-NEXT:    [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
-; CHECK-NEXT:    [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
-; CHECK-NEXT:    br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
-; CHECK:       pred.load.if49:
-; CHECK-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
-; CHECK-NEXT:    [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE50]]
-; CHECK:       pred.load.continue50:
-; CHECK-NEXT:    [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
-; CHECK-NEXT:    [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
-; CHECK-NEXT:    br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
-; CHECK:       pred.load.if51:
-; CHECK-NEXT:    [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
-; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE52]]
-; CHECK:       pred.load.continue52:
-; CHECK-NEXT:    [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
-; CHECK-NEXT:    [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
-; CHECK-NEXT:    br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
-; CHECK:       pred.load.if53:
-; CHECK-NEXT:    [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
-; CHECK-NEXT:    [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE54]]
-; CHECK:       pred.load.continue54:
-; CHECK-NEXT:    [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
-; CHECK-NEXT:    [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
-; CHECK-NEXT:    br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
-; CHECK:       pred.load.if55:
-; CHECK-NEXT:    [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
-; CHECK-NEXT:    [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE56]]
-; CHECK:       pred.load.continue56:
-; CHECK-NEXT:    [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
-; CHECK-NEXT:    [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
-; CHECK-NEXT:    br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
-; CHECK:       pred.load.if57:
-; CHECK-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
-; CHECK-NEXT:    [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE58]]
-; CHECK:       pred.load.continue58:
-; CHECK-NEXT:    [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
-; CHECK-NEXT:    [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
-; CHECK-NEXT:    br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
-; CHECK:       pred.load.if59:
-; CHECK-NEXT:    [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
-; CHECK-NEXT:    [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE60]]
-; CHECK:       pred.load.continue60:
-; CHECK-NEXT:    [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
-; CHECK-NEXT:    [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-NEXT:    br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
-; CHECK:       pred.load.if61:
 ; CHECK-NEXT:    [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
 ; CHECK-NEXT:    [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE62]]
-; CHECK:       pred.load.continue62:
-; CHECK-NEXT:    [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK:       pred.load.continue30:
+; CHECK-NEXT:    [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-NEXT:    [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ]
 ; CHECK-NEXT:    [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-NEXT:    [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
 ; CHECK-NEXT:    [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index e74830700776c..6ead2a4eecbe8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -997,313 +997,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
-; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
-; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
-; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
-; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
-; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
-; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
-; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
-; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
-; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; CHECK-INTERLEAVE1:       pred.load.continue:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if1:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
 ; CHECK-INTERLEAVE1:       pred.load.continue2:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if3:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-INTERLEAVE1-NEXT:    [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
 ; CHECK-INTERLEAVE1:       pred.load.continue4:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if5:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-INTERLEAVE1-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
 ; CHECK-INTERLEAVE1:       pred.load.continue6:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if7:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-INTERLEAVE1-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
 ; CHECK-INTERLEAVE1:       pred.load.continue8:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if9:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-INTERLEAVE1-NEXT:    [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
 ; CHECK-INTERLEAVE1:       pred.load.continue10:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if11:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-INTERLEAVE1-NEXT:    [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
 ; CHECK-INTERLEAVE1:       pred.load.continue12:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if13:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-INTERLEAVE1-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
 ; CHECK-INTERLEAVE1:       pred.load.continue14:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if15:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-INTERLEAVE1-NEXT:    [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
 ; CHECK-INTERLEAVE1:       pred.load.continue16:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if17:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-INTERLEAVE1-NEXT:    [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
 ; CHECK-INTERLEAVE1:       pred.load.continue18:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if19:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-INTERLEAVE1-NEXT:    [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
 ; CHECK-INTERLEAVE1:       pred.load.continue20:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if21:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-INTERLEAVE1-NEXT:    [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE22]]
 ; CHECK-INTERLEAVE1:       pred.load.continue22:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if23:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-INTERLEAVE1-NEXT:    [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE24]]
 ; CHECK-INTERLEAVE1:       pred.load.continue24:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if25:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-INTERLEAVE1-NEXT:    [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE26]]
 ; CHECK-INTERLEAVE1:       pred.load.continue26:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
 ; CHECK-INTERLEAVE1:       pred.load.if27:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-INTERLEAVE1-NEXT:    [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE28]]
 ; CHECK-INTERLEAVE1:       pred.load.continue28:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
 ; CHECK-INTERLEAVE1:       pred.load.if29:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE30]]
-; CHECK-INTERLEAVE1:       pred.load.continue30:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if31:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE32]]
-; CHECK-INTERLEAVE1:       pred.load.continue32:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if33:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE34]]
-; CHECK-INTERLEAVE1:       pred.load.continue34:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if35:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE36]]
-; CHECK-INTERLEAVE1:       pred.load.continue36:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if37:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE38]]
-; CHECK-INTERLEAVE1:       pred.load.continue38:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if39:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE40]]
-; CHECK-INTERLEAVE1:       pred.load.continue40:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if41:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE42]]
-; CHECK-INTERLEAVE1:       pred.load.continue42:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if43:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE44]]
-; CHECK-INTERLEAVE1:       pred.load.continue44:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if45:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE46]]
-; CHECK-INTERLEAVE1:       pred.load.continue46:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if47:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE48]]
-; CHECK-INTERLEAVE1:       pred.load.continue48:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if49:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE50]]
-; CHECK-INTERLEAVE1:       pred.load.continue50:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if51:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE52]]
-; CHECK-INTERLEAVE1:       pred.load.continue52:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if53:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE54]]
-; CHECK-INTERLEAVE1:       pred.load.continue54:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if55:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE56]]
-; CHECK-INTERLEAVE1:       pred.load.continue56:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if57:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE58]]
-; CHECK-INTERLEAVE1:       pred.load.continue58:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
-; CHECK-INTERLEAVE1:       pred.load.if59:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
-; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE60]]
-; CHECK-INTERLEAVE1:       pred.load.continue60:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVE1:       pred.load.if61:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
 ; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVE1:       pred.load.continue62:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-INTERLEAVE1:       pred.load.continue30:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32>
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
 ; CHECK-INTERLEAVE1-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
@@ -1333,313 +1253,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
-; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
-; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
-; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
-; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
-; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
-; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if:
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; CHECK-INTERLEAVED:       pred.load.continue:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if1:
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
 ; CHECK-INTERLEAVED:       pred.load.continue2:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if3:
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
 ; CHECK-INTERLEAVED:       pred.load.continue4:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if5:
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-INTERLEAVED-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-INTERLEAVED-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
 ; CHECK-INTERLEAVED:       pred.load.continue6:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if7:
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
 ; CHECK-INTERLEAVED:       pred.load.continue8:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if9:
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-INTERLEAVED-NEXT:    [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
 ; CHECK-INTERLEAVED:       pred.load.continue10:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if11:
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-INTERLEAVED-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-INTERLEAVED-NEXT:    [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
 ; CHECK-INTERLEAVED:       pred.load.continue12:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if13:
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
 ; CHECK-INTERLEAVED-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-INTERLEAVED-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
 ; CHECK-INTERLEAVED:       pred.load.continue14:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if15:
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-INTERLEAVED-NEXT:    [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
 ; CHECK-INTERLEAVED:       pred.load.continue16:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if17:
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
 ; CHECK-INTERLEAVED-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-INTERLEAVED-NEXT:    [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
 ; CHECK-INTERLEAVED:       pred.load.continue18:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if19:
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
 ; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-INTERLEAVED-NEXT:    [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
 ; CHECK-INTERLEAVED:       pred.load.continue20:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if21:
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
 ; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-INTERLEAVED-NEXT:    [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE22]]
 ; CHECK-INTERLEAVED:       pred.load.continue22:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if23:
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
 ; CHECK-INTERLEAVED-NEXT:    [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-INTERLEAVED-NEXT:    [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE24]]
 ; CHECK-INTERLEAVED:       pred.load.continue24:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if25:
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-INTERLEAVED-NEXT:    [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE26]]
 ; CHECK-INTERLEAVED:       pred.load.continue26:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
 ; CHECK-INTERLEAVED:       pred.load.if27:
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-INTERLEAVED-NEXT:    [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-INTERLEAVED-NEXT:    [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE28]]
 ; CHECK-INTERLEAVED:       pred.load.continue28:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
 ; CHECK-INTERLEAVED:       pred.load.if29:
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
 ; CHECK-INTERLEAVED-NEXT:    [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE30]]
-; CHECK-INTERLEAVED:       pred.load.continue30:
-; CHECK-INTERLEAVED-NEXT:    [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if31:
-; CHECK-INTERLEAVED-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE32]]
-; CHECK-INTERLEAVED:       pred.load.continue32:
-; CHECK-INTERLEAVED-NEXT:    [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if33:
-; CHECK-INTERLEAVED-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE34]]
-; CHECK-INTERLEAVED:       pred.load.continue34:
-; CHECK-INTERLEAVED-NEXT:    [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if35:
-; CHECK-INTERLEAVED-NEXT:    [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE36]]
-; CHECK-INTERLEAVED:       pred.load.continue36:
-; CHECK-INTERLEAVED-NEXT:    [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if37:
-; CHECK-INTERLEAVED-NEXT:    [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE38]]
-; CHECK-INTERLEAVED:       pred.load.continue38:
-; CHECK-INTERLEAVED-NEXT:    [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if39:
-; CHECK-INTERLEAVED-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE40]]
-; CHECK-INTERLEAVED:       pred.load.continue40:
-; CHECK-INTERLEAVED-NEXT:    [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if41:
-; CHECK-INTERLEAVED-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE42]]
-; CHECK-INTERLEAVED:       pred.load.continue42:
-; CHECK-INTERLEAVED-NEXT:    [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if43:
-; CHECK-INTERLEAVED-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE44]]
-; CHECK-INTERLEAVED:       pred.load.continue44:
-; CHECK-INTERLEAVED-NEXT:    [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if45:
-; CHECK-INTERLEAVED-NEXT:    [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE46]]
-; CHECK-INTERLEAVED:       pred.load.continue46:
-; CHECK-INTERLEAVED-NEXT:    [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if47:
-; CHECK-INTERLEAVED-NEXT:    [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE48]]
-; CHECK-INTERLEAVED:       pred.load.continue48:
-; CHECK-INTERLEAVED-NEXT:    [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if49:
-; CHECK-INTERLEAVED-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE50]]
-; CHECK-INTERLEAVED:       pred.load.continue50:
-; CHECK-INTERLEAVED-NEXT:    [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if51:
-; CHECK-INTERLEAVED-NEXT:    [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE52]]
-; CHECK-INTERLEAVED:       pred.load.continue52:
-; CHECK-INTERLEAVED-NEXT:    [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if53:
-; CHECK-INTERLEAVED-NEXT:    [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE54]]
-; CHECK-INTERLEAVED:       pred.load.continue54:
-; CHECK-INTERLEAVED-NEXT:    [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if55:
-; CHECK-INTERLEAVED-NEXT:    [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE56]]
-; CHECK-INTERLEAVED:       pred.load.continue56:
-; CHECK-INTERLEAVED-NEXT:    [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if57:
-; CHECK-INTERLEAVED-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE58]]
-; CHECK-INTERLEAVED:       pred.load.continue58:
-; CHECK-INTERLEAVED-NEXT:    [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
-; CHECK-INTERLEAVED:       pred.load.if59:
-; CHECK-INTERLEAVED-NEXT:    [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
-; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE60]]
-; CHECK-INTERLEAVED:       pred.load.continue60:
-; CHECK-INTERLEAVED-NEXT:    [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
-; CHECK-INTERLEAVED-NEXT:    [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVED:       pred.load.if61:
 ; CHECK-INTERLEAVED-NEXT:    [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
 ; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVED:       pred.load.continue62:
-; CHECK-INTERLEAVED-NEXT:    [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-INTERLEAVED:       pred.load.continue30:
+; CHECK-INTERLEAVED-NEXT:    [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
 ; CHECK-INTERLEAVED-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
@@ -1669,313 +1509,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
 ; CHECK-MAXBW-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
 ; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
-; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
-; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
-; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
-; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
-; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
-; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
-; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
-; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
-; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
 ; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK-MAXBW:       pred.load.if:
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
 ; CHECK-MAXBW-NEXT:    [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-MAXBW-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT:    [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; CHECK-MAXBW:       pred.load.continue:
 ; CHECK-MAXBW-NEXT:    [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-MAXBW-NEXT:    [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
 ; CHECK-MAXBW:       pred.load.if1:
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
 ; CHECK-MAXBW-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-MAXBW-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
 ; CHECK-MAXBW:       pred.load.continue2:
 ; CHECK-MAXBW-NEXT:    [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-MAXBW-NEXT:    [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
 ; CHECK-MAXBW:       pred.load.if3:
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-MAXBW-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
 ; CHECK-MAXBW-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-MAXBW-NEXT:    [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT:    [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
 ; CHECK-MAXBW:       pred.load.continue4:
 ; CHECK-MAXBW-NEXT:    [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-MAXBW-NEXT:    [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
 ; CHECK-MAXBW:       pred.load.if5:
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-MAXBW-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
 ; CHECK-MAXBW-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-MAXBW-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
 ; CHECK-MAXBW:       pred.load.continue6:
 ; CHECK-MAXBW-NEXT:    [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-MAXBW-NEXT:    [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
 ; CHECK-MAXBW:       pred.load.if7:
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-MAXBW-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
 ; CHECK-MAXBW-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-MAXBW-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-MAXBW-NEXT:    [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
 ; CHECK-MAXBW:       pred.load.continue8:
 ; CHECK-MAXBW-NEXT:    [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-MAXBW-NEXT:    [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
 ; CHECK-MAXBW:       pred.load.if9:
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-MAXBW-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-MAXBW-NEXT:    [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-MAXBW-NEXT:    [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-MAXBW-NEXT:    [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
 ; CHECK-MAXBW:       pred.load.continue10:
 ; CHECK-MAXBW-NEXT:    [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-MAXBW-NEXT:    [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
 ; CHECK-MAXBW:       pred.load.if11:
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-MAXBW-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-MAXBW-NEXT:    [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-MAXBW-NEXT:    [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
 ; CHECK-MAXBW:       pred.load.continue12:
 ; CHECK-MAXBW-NEXT:    [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-MAXBW-NEXT:    [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
 ; CHECK-MAXBW:       pred.load.if13:
+; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
 ; CHECK-MAXBW-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
 ; CHECK-MAXBW-NEXT:    [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-MAXBW-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-MAXBW-NEXT:    [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
 ; CHECK-MAXBW:       pred.load.continue14:
 ; CHECK-MAXBW-NEXT:    [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-MAXBW-NEXT:    [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
 ; CHECK-MAXBW:       pred.load.if15:
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
 ; CHECK-MAXBW-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
 ; CHECK-MAXBW-NEXT:    [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-MAXBW-NEXT:    [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-MAXBW-NEXT:    [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
 ; CHECK-MAXBW:       pred.load.continue16:
 ; CHECK-MAXBW-NEXT:    [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-MAXBW-NEXT:    [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
 ; CHECK-MAXBW:       pred.load.if17:
+; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
 ; CHECK-MAXBW-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-MAXBW-NEXT:    [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-MAXBW-NEXT:    [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT:    [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
 ; CHECK-MAXBW:       pred.load.continue18:
 ; CHECK-MAXBW-NEXT:    [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-MAXBW-NEXT:    [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
 ; CHECK-MAXBW:       pred.load.if19:
+; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
 ; CHECK-MAXBW-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
 ; CHECK-MAXBW-NEXT:    [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-MAXBW-NEXT:    [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT:    [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
 ; CHECK-MAXBW:       pred.load.continue20:
 ; CHECK-MAXBW-NEXT:    [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-MAXBW-NEXT:    [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
 ; CHECK-MAXBW:       pred.load.if21:
+; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
 ; CHECK-MAXBW-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
 ; CHECK-MAXBW-NEXT:    [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-MAXBW-NEXT:    [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT:    [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE22]]
 ; CHECK-MAXBW:       pred.load.continue22:
 ; CHECK-MAXBW-NEXT:    [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-MAXBW-NEXT:    [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
 ; CHECK-MAXBW:       pred.load.if23:
+; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
 ; CHECK-MAXBW-NEXT:    [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
 ; CHECK-MAXBW-NEXT:    [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-MAXBW-NEXT:    [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT:    [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE24]]
 ; CHECK-MAXBW:       pred.load.continue24:
 ; CHECK-MAXBW-NEXT:    [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-MAXBW-NEXT:    [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
 ; CHECK-MAXBW:       pred.load.if25:
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-MAXBW-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
 ; CHECK-MAXBW-NEXT:    [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-MAXBW-NEXT:    [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-MAXBW-NEXT:    [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE26]]
 ; CHECK-MAXBW:       pred.load.continue26:
 ; CHECK-MAXBW-NEXT:    [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-MAXBW-NEXT:    [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
 ; CHECK-MAXBW:       pred.load.if27:
+; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-MAXBW-NEXT:    [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
 ; CHECK-MAXBW-NEXT:    [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-MAXBW-NEXT:    [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-MAXBW-NEXT:    [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE28]]
 ; CHECK-MAXBW:       pred.load.continue28:
 ; CHECK-MAXBW-NEXT:    [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-MAXBW-NEXT:    [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-MAXBW-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
 ; CHECK-MAXBW:       pred.load.if29:
+; CHECK-MAXBW-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
 ; CHECK-MAXBW-NEXT:    [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
 ; CHECK-MAXBW-NEXT:    [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE30]]
-; CHECK-MAXBW:       pred.load.continue30:
-; CHECK-MAXBW-NEXT:    [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-MAXBW-NEXT:    [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
-; CHECK-MAXBW-NEXT:    [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
-; CHECK-MAXBW-NEXT:    br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK-MAXBW:       pred.load.if31:
-; CHECK-MAXBW-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-MAXBW-NEXT:    [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE32]]
-; CHECK-MAXBW:       pred.load.continue32:
-; CHECK-MAXBW-NEXT:    [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
-; CHECK-MAXBW-NEXT:    [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
-; CHECK-MAXBW-NEXT:    br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK-MAXBW:       pred.load.if33:
-; CHECK-MAXBW-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-MAXBW-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE34]]
-; CHECK-MAXBW:       pred.load.continue34:
-; CHECK-MAXBW-NEXT:    [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
-; CHECK-MAXBW-NEXT:    [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
-; CHECK-MAXBW-NEXT:    br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
-; CHECK-MAXBW:       pred.load.if35:
-; CHECK-MAXBW-NEXT:    [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-MAXBW-NEXT:    [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE36]]
-; CHECK-MAXBW:       pred.load.continue36:
-; CHECK-MAXBW-NEXT:    [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
-; CHECK-MAXBW-NEXT:    [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
-; CHECK-MAXBW-NEXT:    br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
-; CHECK-MAXBW:       pred.load.if37:
-; CHECK-MAXBW-NEXT:    [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-MAXBW-NEXT:    [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE38]]
-; CHECK-MAXBW:       pred.load.continue38:
-; CHECK-MAXBW-NEXT:    [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
-; CHECK-MAXBW-NEXT:    [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
-; CHECK-MAXBW-NEXT:    br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
-; CHECK-MAXBW:       pred.load.if39:
-; CHECK-MAXBW-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-MAXBW-NEXT:    [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE40]]
-; CHECK-MAXBW:       pred.load.continue40:
-; CHECK-MAXBW-NEXT:    [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
-; CHECK-MAXBW-NEXT:    [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
-; CHECK-MAXBW-NEXT:    br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
-; CHECK-MAXBW:       pred.load.if41:
-; CHECK-MAXBW-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-MAXBW-NEXT:    [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE42]]
-; CHECK-MAXBW:       pred.load.continue42:
-; CHECK-MAXBW-NEXT:    [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
-; CHECK-MAXBW-NEXT:    [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
-; CHECK-MAXBW-NEXT:    br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
-; CHECK-MAXBW:       pred.load.if43:
-; CHECK-MAXBW-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-MAXBW-NEXT:    [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE44]]
-; CHECK-MAXBW:       pred.load.continue44:
-; CHECK-MAXBW-NEXT:    [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
-; CHECK-MAXBW-NEXT:    [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
-; CHECK-MAXBW-NEXT:    br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
-; CHECK-MAXBW:       pred.load.if45:
-; CHECK-MAXBW-NEXT:    [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-MAXBW-NEXT:    [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE46]]
-; CHECK-MAXBW:       pred.load.continue46:
-; CHECK-MAXBW-NEXT:    [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
-; CHECK-MAXBW-NEXT:    [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
-; CHECK-MAXBW-NEXT:    br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
-; CHECK-MAXBW:       pred.load.if47:
-; CHECK-MAXBW-NEXT:    [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-MAXBW-NEXT:    [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE48]]
-; CHECK-MAXBW:       pred.load.continue48:
-; CHECK-MAXBW-NEXT:    [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
-; CHECK-MAXBW-NEXT:    [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
-; CHECK-MAXBW-NEXT:    br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
-; CHECK-MAXBW:       pred.load.if49:
-; CHECK-MAXBW-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-MAXBW-NEXT:    [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE50]]
-; CHECK-MAXBW:       pred.load.continue50:
-; CHECK-MAXBW-NEXT:    [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
-; CHECK-MAXBW-NEXT:    [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
-; CHECK-MAXBW-NEXT:    br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
-; CHECK-MAXBW:       pred.load.if51:
-; CHECK-MAXBW-NEXT:    [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-MAXBW-NEXT:    [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE52]]
-; CHECK-MAXBW:       pred.load.continue52:
-; CHECK-MAXBW-NEXT:    [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
-; CHECK-MAXBW-NEXT:    [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
-; CHECK-MAXBW-NEXT:    br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
-; CHECK-MAXBW:       pred.load.if53:
-; CHECK-MAXBW-NEXT:    [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-MAXBW-NEXT:    [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE54]]
-; CHECK-MAXBW:       pred.load.continue54:
-; CHECK-MAXBW-NEXT:    [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
-; CHECK-MAXBW-NEXT:    [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
-; CHECK-MAXBW-NEXT:    br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
-; CHECK-MAXBW:       pred.load.if55:
-; CHECK-MAXBW-NEXT:    [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-MAXBW-NEXT:    [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE56]]
-; CHECK-MAXBW:       pred.load.continue56:
-; CHECK-MAXBW-NEXT:    [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
-; CHECK-MAXBW-NEXT:    [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
-; CHECK-MAXBW-NEXT:    br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
-; CHECK-MAXBW:       pred.load.if57:
-; CHECK-MAXBW-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-MAXBW-NEXT:    [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE58]]
-; CHECK-MAXBW:       pred.load.continue58:
-; CHECK-MAXBW-NEXT:    [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
-; CHECK-MAXBW-NEXT:    [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
-; CHECK-MAXBW-NEXT:    br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
-; CHECK-MAXBW:       pred.load.if59:
-; CHECK-MAXBW-NEXT:    [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-MAXBW-NEXT:    [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
-; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE60]]
-; CHECK-MAXBW:       pred.load.continue60:
-; CHECK-MAXBW-NEXT:    [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
-; CHECK-MAXBW-NEXT:    [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-MAXBW-NEXT:    br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
-; CHECK-MAXBW:       pred.load.if61:
 ; CHECK-MAXBW-NEXT:    [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
 ; CHECK-MAXBW-NEXT:    [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
 ; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE62]]
-; CHECK-MAXBW:       pred.load.continue62:
-; CHECK-MAXBW-NEXT:    [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-MAXBW:       pred.load.continue30:
+; CHECK-MAXBW-NEXT:    [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-MAXBW-NEXT:    [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32>
 ; CHECK-MAXBW-NEXT:    [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
 ; CHECK-MAXBW-NEXT:    [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
 ; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 37eac89acfd11..ab593f6f8bb6b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1370,10 +1370,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
 ; CHECK-MAXBW-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
 ; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]]
 ; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer
 ; CHECK-MAXBW-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 62e248bed85d9..0c3b987a74ece 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -124,6 +124,77 @@ exit:
   ret i32 %add
 }
 
+; Test that we also get VPExpressions when there is predication.
+define i32 @print_partial_reduction_predication(ptr %a, ptr %b, i64 %N) "target-features"="+sve" {
+; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT:   EMIT vp<%4> = reduction-start-vector ir<0>, ir<0>, ir<4>
+; CHECK-NEXT:   EMIT vp<%5> = TC > VF ? TC - VF : 0 ir<%N>
+; CHECK-NEXT:   EMIT vp<%index.part.next> = VF * Part + ir<0>
+; CHECK-NEXT:   EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, ir<%N>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     ACTIVE-LANE-MASK-PHI vp<%7> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next>
+; CHECK-NEXT:     WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%4>, vp<%11> (VF scaled by 1/4)
+; CHECK-NEXT:     vp<%8> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
+; CHECK-NEXT:     CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%8>
+; CHECK-NEXT:     vp<%9> = vector-pointer ir<%gep.a>
+; CHECK-NEXT:     WIDEN ir<%load.a> = load vp<%9>, vp<%7>
+; CHECK-NEXT:     CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%8>
+; CHECK-NEXT:     vp<%10> = vector-pointer ir<%gep.b>
+; CHECK-NEXT:     WIDEN ir<%load.b> = load vp<%10>, vp<%7>
+; CHECK-NEXT:     EXPRESSION vp<%11> = vp<%7> + partial.reduce.add (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32), <badref>)
+; CHECK-NEXT:     EMIT vp<%index.next> = add vp<%6>, vp<%1>
+; CHECK-NEXT:     EMIT vp<%12> = VF * Part + vp<%6>
+; CHECK-NEXT:     EMIT vp<%active.lane.mask.next> = active lane mask vp<%12>, vp<%5>, ir<1>
+; CHECK-NEXT:     EMIT vp<%13> = not vp<%active.lane.mask.next>
+; CHECK-NEXT:     EMIT branch-on-cond vp<%13>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<%15> = compute-reduction-result ir<%accum>, vp<%11>
+; CHECK-NEXT: Successor(s): ir-bb<exit>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%15> from middle.block)
+; CHECK-NEXT: No successors
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !1
+
+exit:
+  ret i32 %add
+}
+
+
 !0 = distinct !{!0, !2, !3}
+!1 = distinct !{!1, !2, !4}
 !2 = !{!"llvm.loop.interleave.count", i32 1}
 !3 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
+!4 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}

From 5cb2c6806e8c78776935b2972e4c8b6dfcda697d Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Tue, 11 Nov 2025 09:46:13 +0000
Subject: [PATCH 84/97] [libunwind][AArch64] Disable ZA before resuming from
 unwinding (on Linux) (#165451)

This patch reimplements the SME ABI `__arm_za_disable` routine within
libunwind. This routine must be called before resuming from unwinding on
AArch64 platforms with SME support.

Before calling the routine, we need to check that SME is available. In
this patch, this is implemented for Linux-based platforms by checking
HWCAP2. It should be possible to implement this check for other
platforms as required.

This patch includes a test for this functionality. This test requires
SME, so on platforms without it, it will simply pass.
---
 libunwind/src/Registers.hpp               |  53 +++++++---
 libunwind/src/UnwindRegistersSave.S       |  62 ++++++++++++
 libunwind/test/aarch64_za_unwind.pass.cpp | 117 ++++++++++++++++++++++
 3 files changed, 217 insertions(+), 15 deletions(-)
 create mode 100644 libunwind/test/aarch64_za_unwind.pass.cpp

diff --git a/libunwind/src/Registers.hpp b/libunwind/src/Registers.hpp
index 5a5b57835379a..9d4c8344150f6 100644
--- a/libunwind/src/Registers.hpp
+++ b/libunwind/src/Registers.hpp
@@ -20,6 +20,11 @@
 #include "libunwind_ext.h"
 #include "shadow_stack_unwind.h"
 
+#if __has_include(<sys/auxv.h>)
+#include <sys/auxv.h>
+#define HAVE_SYS_AUXV_H
+#endif
+
 namespace libunwind {
 
 // For emulating 128-bit registers
@@ -1828,6 +1833,7 @@ inline const char *Registers_ppc64::getRegisterName(int regNum) {
 /// process.
 class _LIBUNWIND_HIDDEN Registers_arm64;
 extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *);
+extern "C" int64_t __libunwind_Registers_arm64_za_disable();
 
 #if defined(_LIBUNWIND_USE_GCS)
 extern "C" void *__libunwind_shstk_get_jump_target() {
@@ -1837,7 +1843,7 @@ extern "C" void *__libunwind_shstk_get_jump_target() {
 
 class _LIBUNWIND_HIDDEN Registers_arm64 {
 public:
-  Registers_arm64();
+  Registers_arm64() = default;
   Registers_arm64(const void *registers);
   Registers_arm64(const Registers_arm64 &);
   Registers_arm64 &operator=(const Registers_arm64 &);
@@ -1855,7 +1861,10 @@ class _LIBUNWIND_HIDDEN Registers_arm64 {
   v128        getVectorRegister(int num) const;
   void        setVectorRegister(int num, v128 value);
   static const char *getRegisterName(int num);
-  void        jumpto() { __libunwind_Registers_arm64_jumpto(this); }
+  void jumpto() {
+    zaDisable();
+    __libunwind_Registers_arm64_jumpto(this);
+  }
   static constexpr int lastDwarfRegNum() {
     return _LIBUNWIND_HIGHEST_DWARF_REGISTER_ARM64;
   }
@@ -1908,25 +1917,43 @@ class _LIBUNWIND_HIDDEN Registers_arm64 {
 private:
   uint64_t lazyGetVG() const;
 
+  void zaDisable() const {
+    if (!_misc_registers.__has_sme)
+      return;
+    if (__libunwind_Registers_arm64_za_disable() != 0)
+      _LIBUNWIND_ABORT("SME ZA disable failed");
+  }
+
+  static bool checkHasSME() {
+#if defined(HAVE_SYS_AUXV_H)
+    constexpr int hwcap2_sme = (1 << 23);
+    unsigned long hwcap2 = getauxval(AT_HWCAP2);
+    return (hwcap2 & hwcap2_sme) != 0;
+#endif
+    // TODO: Support other platforms.
+    return false;
+  }
+
   struct GPRs {
-    uint64_t __x[29]; // x0-x28
-    uint64_t __fp;    // Frame pointer x29
-    uint64_t __lr;    // Link register x30
-    uint64_t __sp;    // Stack pointer x31
-    uint64_t __pc;    // Program counter
-    uint64_t __ra_sign_state; // RA sign state register
+    uint64_t __x[29] = {};        // x0-x28
+    uint64_t __fp = 0;            // Frame pointer x29
+    uint64_t __lr = 0;            // Link register x30
+    uint64_t __sp = 0;            // Stack pointer x31
+    uint64_t __pc = 0;            // Program counter
+    uint64_t __ra_sign_state = 0; // RA sign state register
   };
 
   struct Misc {
-    mutable uint64_t __vg = 0; // Vector Granule
+    mutable uint32_t __vg = 0; // Vector Granule
+    bool __has_sme = checkHasSME();
   };
 
-  GPRs _registers;
+  GPRs _registers = {};
   // Currently only the lower double in 128-bit vectore registers
   // is perserved during unwinding.  We could define new register
   // numbers (> 96) which mean whole vector registers, then this
   // struct would need to change to contain whole vector registers.
-  double _vectorHalfRegisters[32];
+  double _vectorHalfRegisters[32] = {};
 
   // Miscellaneous/virtual registers. These are stored below the GPRs and FPRs
   // as they do not correspond to physical registers, so do not need to be
@@ -1971,10 +1998,6 @@ Registers_arm64::operator=(const Registers_arm64 &other) {
   return *this;
 }
 
-inline Registers_arm64::Registers_arm64() {
-  memset(static_cast<void *>(this), 0, sizeof(*this));
-}
-
 inline bool Registers_arm64::validRegister(int regNum) const {
   if (regNum == UNW_REG_IP)
     return true;
diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S
index b7ddd0a621d18..f988fd461def1 100644
--- a/libunwind/src/UnwindRegistersSave.S
+++ b/libunwind/src/UnwindRegistersSave.S
@@ -829,6 +829,68 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
   ret
 #endif
 
+//
+// extern "C" int64_t __libunwind_Registers_arm64_za_disable()
+//
+//   This function implements the requirements of the __arm_za_disable ABI
+//   routine, except that it will not abort; it will return a non-zero value
+//   to signify the routine failed.
+//
+//   Note: This function uses SME instructions. It must only be called if SME
+//   has been confirmed to be available.
+//
+// On return:
+//
+//   A status is placed in x0. A zero value indicates success; any non-zero
+//   value indicates failure.
+//
+  .p2align 2
+DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_za_disable)
+  .variant_pcs __libunwind_Registers_arm64_za_disable
+#if __has_feature(ptrauth_calls)
+  pacibsp
+#endif
+  // If TPIDR2_EL0 is null, the subroutine just disables ZA.
+  .inst 0xd53bd0b0 // mrs x16, TPIDR2_EL0
+  cbz x16, 1f
+
+  // If any of the reserved bytes in the first 16 bytes of the TPIDR2 block are
+  // nonzero, return a non-zero value (libunwind will then abort).
+  ldrh w0, [x16, #10]
+  cbnz w0, 2f
+  ldr w0, [x16, #12]
+  cbnz w0, 2f
+
+  // If num_za_save_slices is zero, the subroutine just disables ZA.
+  ldrh w0, [x16, #8]
+  cbz x0, 1f
+
+  // If za_save_buffer is NULL, the subroutine just disables ZA.
+  ldr x16, [x16]
+  cbz x16, 1f
+
+  // Store ZA to za_save_buffer.
+  mov x15, xzr
+0:
+  .inst 0xe1206200 // str za[w15,0], [x16]
+  .inst 0x04305830 // addsvl x16, x16, #1
+  add x15, x15, #1
+  cmp x0, x15
+  b.ne 0b
+1:
+  // * Set TPIDR2_EL0 to null.
+  .inst 0xd51bd0bf // msr TPIDR2_EL0, xzr
+  // * Set PSTATE.ZA to 0.
+  .inst 0xd503447f // smstop za
+  // * Return zero (success)
+  mov x0, xzr
+2:
+#if __has_feature(ptrauth_calls)
+  retab
+#else
+  ret
+#endif
+
 #elif defined(__arm__) && !defined(__APPLE__)
 
 #if !defined(__ARM_ARCH_ISA_ARM)
diff --git a/libunwind/test/aarch64_za_unwind.pass.cpp b/libunwind/test/aarch64_za_unwind.pass.cpp
new file mode 100644
index 0000000000000..2985bb8d298de
--- /dev/null
+++ b/libunwind/test/aarch64_za_unwind.pass.cpp
@@ -0,0 +1,117 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: linux && target={{aarch64-.+}}
+
+#include <libunwind.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/auxv.h>
+
+// Basic test of unwinding with SME lazy saves. This tests libunwind disables ZA
+// (and commits a lazy save of ZA) before resuming from unwinding.
+
+// Note: This test requires SME (and is setup to pass on targets without SME).
+
+static bool checkHasSME() {
+  constexpr int hwcap2_sme = (1 << 23);
+  unsigned long hwcap2 = getauxval(AT_HWCAP2);
+  return (hwcap2 & hwcap2_sme) != 0;
+}
+
+struct TPIDR2Block {
+  void *za_save_buffer;
+  uint64_t num_save_slices;
+};
+
+__attribute__((noinline)) void private_za() {
+  // Note: Lazy save active on entry to function.
+  unw_context_t context;
+  unw_cursor_t cursor;
+
+  unw_getcontext(&context);
+  unw_init_local(&cursor, &context);
+  unw_step(&cursor);
+  unw_resume(&cursor);
+}
+
+bool isZAOn() {
+  register uint64_t svcr asm("x20");
+  asm(".inst 0xd53b4254" : "=r"(svcr));
+  return (svcr & 0b10) != 0;
+}
+
+__attribute__((noinline)) void za_function_with_lazy_save() {
+  register uint64_t tmp asm("x8");
+
+  // SMSTART ZA (should zero ZA)
+  asm(".inst 0xd503457f");
+
+  // RDSVL x8, #1 (read streaming vector length)
+  asm(".inst 0x04bf5828" : "=r"(tmp));
+
+  // Allocate and fill ZA save buffer with 0xAA.
+  size_t buffer_size = tmp * tmp;
+  uint8_t *za_save_buffer = (uint8_t *)alloca(buffer_size);
+  memset(za_save_buffer, 0xAA, buffer_size);
+
+  TPIDR2Block block = {za_save_buffer, tmp};
+  tmp = reinterpret_cast<uint64_t>(&block);
+
+  // MRS TPIDR2_EL0, x8 (setup lazy save of ZA)
+  asm(".inst 0xd51bd0a8" ::"r"(tmp));
+
+  // ZA should be on before unwinding.
+  if (!isZAOn()) {
+    fprintf(stderr, __FILE__ ": fail (ZA not on before call)\n");
+    abort();
+  } else {
+    fprintf(stderr, __FILE__ ": pass (ZA on before call)\n");
+  }
+
+  private_za();
+
+  // ZA should be off after unwinding.
+  if (isZAOn()) {
+    fprintf(stderr, __FILE__ ": fail (ZA on after unwinding)\n");
+    abort();
+  } else {
+    fprintf(stderr, __FILE__ ": pass (ZA off after unwinding)\n");
+  }
+
+  // MRS x8, TPIDR2_EL0 (read TPIDR2_EL0)
+  asm(".inst 0xd53bd0a8" : "=r"(tmp));
+  // ZA should have been saved (TPIDR2_EL0 zero).
+  if (tmp != 0) {
+    fprintf(stderr, __FILE__ ": fail (TPIDR2_EL0 non-null after unwinding)\n");
+    abort();
+  } else {
+    fprintf(stderr, __FILE__ ": pass (TPIDR2_EL0 null after unwinding)\n");
+  }
+
+  // ZA (all zero) should have been saved to the buffer.
+  for (unsigned i = 0; i < buffer_size; ++i) {
+    if (za_save_buffer[i] != 0) {
+      fprintf(stderr,
+              __FILE__ ": fail (za_save_buffer non-zero after unwinding)\n");
+      abort();
+    }
+  }
+  fprintf(stderr, __FILE__ ": pass (za_save_buffer zero'd after unwinding)\n");
+}
+
+int main(int, char **) {
+  if (!checkHasSME()) {
+    fprintf(stderr, __FILE__ ": pass (no SME support)\n");
+    return 0; // Pass (SME is required for this test to run).
+  }
+  za_function_with_lazy_save();
+  return 0;
+}

From c0bca9cb843cb11173222513ce190e0aec2b1d59 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Tue, 11 Nov 2025 11:03:02 +0100
Subject: [PATCH 85/97] [libc++] Remove some of the uses of aligned_storage
 inside the library (#161635)

`aligned_storage` has been deprecated and will most likely be removed in
a future version of C++. This patch removes some of its uses to avoid
having to work around its removal in the future.
---
 libcxx/include/__memory/temp_value.h |  3 +--
 libcxx/include/any                   | 10 ++++------
 libcxx/include/future                |  5 +----
 3 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/libcxx/include/__memory/temp_value.h b/libcxx/include/__memory/temp_value.h
index 4a133b3fbcf6c..5285bcab9a30d 100644
--- a/libcxx/include/__memory/temp_value.h
+++ b/libcxx/include/__memory/temp_value.h
@@ -12,7 +12,6 @@
 #include <__config>
 #include <__memory/addressof.h>
 #include <__memory/allocator_traits.h>
-#include <__type_traits/aligned_storage.h>
 #include <__utility/forward.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -26,7 +25,7 @@ struct __temp_value {
   typedef allocator_traits<_Alloc> _Traits;
 
 #ifdef _LIBCPP_CXX03_LANG
-  typename aligned_storage<sizeof(_Tp), _LIBCPP_ALIGNOF(_Tp)>::type __v;
+  _ALIGNAS_TYPE(_Tp) char __v[sizeof(_Tp)];
 #else
   union {
     _Tp __v;
diff --git a/libcxx/include/any b/libcxx/include/any
index 148fb16c802a5..b3e5b8748df4c 100644
--- a/libcxx/include/any
+++ b/libcxx/include/any
@@ -88,7 +88,6 @@ namespace std {
 #  include <__new/allocate.h>
 #  include <__type_traits/add_cv_quals.h>
 #  include <__type_traits/add_pointer.h>
-#  include <__type_traits/aligned_storage.h>
 #  include <__type_traits/conditional.h>
 #  include <__type_traits/decay.h>
 #  include <__type_traits/enable_if.h>
@@ -147,14 +146,13 @@ template <class _ValueType>
 _LIBCPP_HIDE_FROM_ABI add_pointer_t<_ValueType> any_cast(any*) _NOEXCEPT;
 
 namespace __any_imp {
-_LIBCPP_SUPPRESS_DEPRECATED_PUSH
-using _Buffer _LIBCPP_NODEBUG = aligned_storage_t<3 * sizeof(void*), alignof(void*)>;
-_LIBCPP_SUPPRESS_DEPRECATED_POP
+inline constexpr size_t __small_buffer_size      = 3 * sizeof(void*);
+inline constexpr size_t __small_buffer_alignment = alignof(void*);
 
 template <class _Tp>
 using _IsSmallObject _LIBCPP_NODEBUG =
     integral_constant<bool,
-                      sizeof(_Tp) <= sizeof(_Buffer) && alignof(_Buffer) % alignof(_Tp) == 0 &&
+                      sizeof(_Tp) <= __small_buffer_size && alignof(_Tp) <= __small_buffer_alignment &&
                           is_nothrow_move_constructible<_Tp>::value >;
 
 enum class _Action { _Destroy, _Copy, _Move, _Get, _TypeInfo };
@@ -284,7 +282,7 @@ private:
   union _Storage {
     _LIBCPP_HIDE_FROM_ABI constexpr _Storage() : __ptr(nullptr) {}
     void* __ptr;
-    __any_imp::_Buffer __buf;
+    alignas(__any_imp::__small_buffer_alignment) char __buf[__any_imp::__small_buffer_size];
   };
 
   _LIBCPP_HIDE_FROM_ABI void*
diff --git a/libcxx/include/future b/libcxx/include/future
index 4b7c09841cbd3..0877d66602e6b 100644
--- a/libcxx/include/future
+++ b/libcxx/include/future
@@ -584,12 +584,9 @@ inline future_status __assoc_sub_state::wait_for(const chrono::duration<_Rep, _P
 template <class _Rp>
 class _LIBCPP_HIDDEN __assoc_state : public __assoc_sub_state {
   typedef __assoc_sub_state base;
-  _LIBCPP_SUPPRESS_DEPRECATED_PUSH
-  typedef typename aligned_storage<sizeof(_Rp), _LIBCPP_ALIGNOF(_Rp)>::type _Up;
-  _LIBCPP_SUPPRESS_DEPRECATED_POP
 
 protected:
-  _Up __value_;
+  _ALIGNAS_TYPE(_Rp) char __value_[sizeof(_Rp)];
 
   _LIBCPP_HIDE_FROM_ABI_VIRTUAL void __on_zero_shared() _NOEXCEPT override;
 

From dacd2f9b2dde38cf23f767e72374157c408c4d2c Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Tue, 11 Nov 2025 11:09:07 +0100
Subject: [PATCH 86/97] [libc++][NFC] Make the exception implementation files
 self-contained (#164377)

---
 libcxx/src/exception.cpp                      | 12 ++----------
 .../support/runtime/exception_fallback.ipp    |  2 ++
 .../src/support/runtime/exception_glibcxx.ipp |  3 +++
 .../support/runtime/exception_libcxxabi.ipp   |  8 ++++++--
 .../support/runtime/exception_libcxxrt.ipp    |  2 ++
 libcxx/src/support/runtime/exception_msvc.ipp |  2 ++
 .../runtime/exception_pointer_cxxabi.ipp      | 19 +++++++++----------
 .../runtime/exception_pointer_glibcxx.ipp     |  2 ++
 .../runtime/exception_pointer_msvc.ipp        |  1 +
 .../exception_pointer_unimplemented.ipp       |  1 +
 10 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/libcxx/src/exception.cpp b/libcxx/src/exception.cpp
index ac6324cd9fe35..9932141006591 100644
--- a/libcxx/src/exception.cpp
+++ b/libcxx/src/exception.cpp
@@ -9,20 +9,12 @@
 #define _LIBCPP_ENABLE_CXX20_REMOVED_UNCAUGHT_EXCEPTION
 #define _LIBCPP_DISABLE_DEPRECATION_WARNINGS
 
-#include <exception>
-#include <new>
-#include <typeinfo>
-
-#if defined(LIBCXXRT) || defined(LIBCXX_BUILDING_LIBCXXABI)
-#  include <cxxabi.h>
-using namespace __cxxabiv1;
-#  define HAVE_DEPENDENT_EH_ABI 1
-#endif
+#include <__config>
 
 #if defined(_LIBCPP_ABI_MICROSOFT)
 #  include "support/runtime/exception_msvc.ipp"
 #  include "support/runtime/exception_pointer_msvc.ipp"
-#elif defined(_LIBCPPABI_VERSION)
+#elif defined(LIBCXX_BUILDING_LIBCXXABI)
 #  include "support/runtime/exception_libcxxabi.ipp"
 #  include "support/runtime/exception_pointer_cxxabi.ipp"
 #elif defined(LIBCXXRT)
diff --git a/libcxx/src/support/runtime/exception_fallback.ipp b/libcxx/src/support/runtime/exception_fallback.ipp
index ba283aee22901..dca904e902da1 100644
--- a/libcxx/src/support/runtime/exception_fallback.ipp
+++ b/libcxx/src/support/runtime/exception_fallback.ipp
@@ -8,6 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include <__verbose_abort>
+#include <exception>
+#include "include/atomic_support.h"
 
 namespace std {
 
diff --git a/libcxx/src/support/runtime/exception_glibcxx.ipp b/libcxx/src/support/runtime/exception_glibcxx.ipp
index aa67cab6bc239..5eb8d87f6d4e1 100644
--- a/libcxx/src/support/runtime/exception_glibcxx.ipp
+++ b/libcxx/src/support/runtime/exception_glibcxx.ipp
@@ -11,6 +11,9 @@
 #  error header can only be used when targeting libstdc++ or libsupc++
 #endif
 
+#include <exception>
+#include <new>
+
 namespace std {
 
 bad_alloc::bad_alloc() noexcept {}
diff --git a/libcxx/src/support/runtime/exception_libcxxabi.ipp b/libcxx/src/support/runtime/exception_libcxxabi.ipp
index df6bd6574bde2..c42bb237d9db8 100644
--- a/libcxx/src/support/runtime/exception_libcxxabi.ipp
+++ b/libcxx/src/support/runtime/exception_libcxxabi.ipp
@@ -7,6 +7,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <exception>
+
+#include <cxxabi.h>
+
 #ifndef _LIBCPPABI_VERSION
 #  error this header can only be used with libc++abi
 #endif
@@ -17,9 +21,9 @@ bool uncaught_exception() noexcept { return uncaught_exceptions() > 0; }
 
 int uncaught_exceptions() noexcept {
 #if _LIBCPPABI_VERSION > 1001
-  return __cxa_uncaught_exceptions();
+  return abi::__cxa_uncaught_exceptions();
 #else
-  return __cxa_uncaught_exception() ? 1 : 0;
+  return abi::__cxa_uncaught_exception() ? 1 : 0;
 #endif
 }
 
diff --git a/libcxx/src/support/runtime/exception_libcxxrt.ipp b/libcxx/src/support/runtime/exception_libcxxrt.ipp
index f17fecc71e34b..6afdc006563c9 100644
--- a/libcxx/src/support/runtime/exception_libcxxrt.ipp
+++ b/libcxx/src/support/runtime/exception_libcxxrt.ipp
@@ -11,6 +11,8 @@
 #  error this header may only be used when targeting libcxxrt
 #endif
 
+#include <exception>
+
 namespace std {
 
 bad_exception::~bad_exception() noexcept {}
diff --git a/libcxx/src/support/runtime/exception_msvc.ipp b/libcxx/src/support/runtime/exception_msvc.ipp
index 2ae004bb02e5d..7114d90892cc1 100644
--- a/libcxx/src/support/runtime/exception_msvc.ipp
+++ b/libcxx/src/support/runtime/exception_msvc.ipp
@@ -12,6 +12,8 @@
 #endif
 
 #include <__verbose_abort>
+#include <exception>
+#include <new>
 
 extern "C" {
 typedef void(__cdecl* terminate_handler)();
diff --git a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp
index 8f5c2060bb06c..75cb7c9d82ccd 100644
--- a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp
+++ b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp
@@ -7,22 +7,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HAVE_DEPENDENT_EH_ABI
-#  error this header may only be used with libc++abi or libcxxrt
-#endif
+#include <cxxabi.h>
+#include <exception>
 
 namespace std {
 
-exception_ptr::~exception_ptr() noexcept { __cxa_decrement_exception_refcount(__ptr_); }
+exception_ptr::~exception_ptr() noexcept { abi::__cxa_decrement_exception_refcount(__ptr_); }
 
 exception_ptr::exception_ptr(const exception_ptr& other) noexcept : __ptr_(other.__ptr_) {
-  __cxa_increment_exception_refcount(__ptr_);
+  abi::__cxa_increment_exception_refcount(__ptr_);
 }
 
 exception_ptr& exception_ptr::operator=(const exception_ptr& other) noexcept {
   if (__ptr_ != other.__ptr_) {
-    __cxa_increment_exception_refcount(other.__ptr_);
-    __cxa_decrement_exception_refcount(__ptr_);
+    abi::__cxa_increment_exception_refcount(other.__ptr_);
+    abi::__cxa_decrement_exception_refcount(__ptr_);
     __ptr_ = other.__ptr_;
   }
   return *this;
@@ -31,7 +30,7 @@ exception_ptr& exception_ptr::operator=(const exception_ptr& other) noexcept {
 exception_ptr exception_ptr::__from_native_exception_pointer(void* __e) noexcept {
   exception_ptr ptr;
   ptr.__ptr_ = __e;
-  __cxa_increment_exception_refcount(ptr.__ptr_);
+  abi::__cxa_increment_exception_refcount(ptr.__ptr_);
 
   return ptr;
 }
@@ -51,12 +50,12 @@ exception_ptr current_exception() noexcept {
   // this whole function would be just:
   //    return exception_ptr(__cxa_current_primary_exception());
   exception_ptr ptr;
-  ptr.__ptr_ = __cxa_current_primary_exception();
+  ptr.__ptr_ = abi::__cxa_current_primary_exception();
   return ptr;
 }
 
 void rethrow_exception(exception_ptr p) {
-  __cxa_rethrow_primary_exception(p.__ptr_);
+  abi::__cxa_rethrow_primary_exception(p.__ptr_);
   // if p.__ptr_ is NULL, above returns so we terminate
   terminate();
 }
diff --git a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp
index 174b44ce0e6f7..4b08db6f1ae6f 100644
--- a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp
+++ b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp
@@ -16,6 +16,8 @@
 // stable ABI), and its rethrow_exception(std::__exception_ptr::exception_ptr)
 // function.
 
+#include <exception>
+
 namespace std {
 
 namespace __exception_ptr {
diff --git a/libcxx/src/support/runtime/exception_pointer_msvc.ipp b/libcxx/src/support/runtime/exception_pointer_msvc.ipp
index 2be5136176e32..4141e0312349b 100644
--- a/libcxx/src/support/runtime/exception_pointer_msvc.ipp
+++ b/libcxx/src/support/runtime/exception_pointer_msvc.ipp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <exception>
 #include <stdio.h>
 #include <stdlib.h>
 
diff --git a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp
index 05a71ce34e5ac..5e55f0f6dede3 100644
--- a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp
+++ b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include <__verbose_abort>
+#include <exception>
 
 namespace std {
 

From 0400b9aadae8e2f567f0fc7fce02c489c9d06da5 Mon Sep 17 00:00:00 2001
From: Dmitriy Smirnov <dmitriy.smirnov@arm.com>
Date: Tue, 11 Nov 2025 10:14:50 +0000
Subject: [PATCH 87/97] [mlir]: Add handling of escaped memrefs to
 erase_dead_alloc_and_stores (#167255)

Patch updates transform.memref.erase_dead_alloc_and_stores to not delete
escaped allocations.
---
 mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp |  3 +++
 mlir/test/Dialect/MemRef/transform-ops.mlir   | 17 +++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
index 6200366cded29..e6adcde72ad66 100644
--- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
+++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
@@ -140,6 +140,9 @@ static bool resultIsNotRead(Operation *op, std::vector<Operation *> &uses) {
   std::vector<Operation *> opUses;
   for (OpOperand &use : op->getUses()) {
     Operation *useOp = use.getOwner();
+    // Use escaped the scope
+    if (useOp->mightHaveTrait<OpTrait::IsTerminator>())
+      return false;
     if (isa<memref::DeallocOp>(useOp) ||
         (useOp->getNumResults() == 0 && useOp->getNumRegions() == 0 &&
          !mlir::hasEffect<MemoryEffects::Read>(useOp)) ||
diff --git a/mlir/test/Dialect/MemRef/transform-ops.mlir b/mlir/test/Dialect/MemRef/transform-ops.mlir
index 3b37c62fcb28e..6e130912c47e9 100644
--- a/mlir/test/Dialect/MemRef/transform-ops.mlir
+++ b/mlir/test/Dialect/MemRef/transform-ops.mlir
@@ -306,6 +306,23 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+// CHECK-LABEL: func.func @dead_alloc_escaped
+func.func @dead_alloc_escaped() -> memref<8x64xf32, 3> {
+  // CHECK: %{{.+}} = memref.alloc
+  %0 = memref.alloc() : memref<8x64xf32, 3>
+  return %0 : memref<8x64xf32, 3>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.memref.erase_dead_alloc_and_stores %0 : (!transform.any_op) -> ()
+    transform.yield
+  }
+}
+
+// -----
+
 // CHECK-LABEL: func.func @dead_alloc
 func.func @dead_alloc() {
   // CHECK-NOT: %{{.+}} = memref.alloc

From c3c4a886b5a6d409433396b5e289fad322e75d4c Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Tue, 11 Nov 2025 11:28:59 +0100
Subject: [PATCH 88/97] [clang][bytecode] Add a C test case (#167484)

The original problem does not reproduce anymore, but add the test case.

Fixes https://github.com/llvm/llvm-project/issues/163563
---
 clang/test/AST/ByteCode/c.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c
index 3360d4f725b24..bffd557ff77a6 100644
--- a/clang/test/AST/ByteCode/c.c
+++ b/clang/test/AST/ByteCode/c.c
@@ -387,3 +387,8 @@ void bar2(void) {
   int a[2][3][4][5]; // all-note {{array 'a' declared here}}
   foo2(&a[0][4]); // all-warning {{array index 4 is past the end of the array}}
 }
+
+void plainComplex(void) {
+  _Complex cd; // all-warning {{_Complex double}}
+  cd = *(_Complex *)&(struct { double r, i; }){0.0, 0.0}; // all-warning {{_Complex double}}
+}

From fdd52f5fe130fb8b98f4aed3d15aa0789cce6b40 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Tue, 11 Nov 2025 10:33:55 +0000
Subject: [PATCH 89/97] [VPlan] Handle WidenGEP in narrowToSingleScalars
 (#166740)

This allows us to strip a special case in VPWidenGEP::execute.
---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  6 --
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 71 +++++++------------
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  3 +-
 .../RISCV/gather-scatter-cost.ll              | 14 ++--
 .../widen-gep-all-indices-invariant.ll        | 12 ++--
 5 files changed, 41 insertions(+), 65 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 72858e1265d86..3840b464e6c2c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1787,12 +1787,6 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
     return getOperand(I + 1)->isDefinedOutsideLoopRegions();
   }
 
-  bool areAllOperandsInvariant() const {
-    return all_of(operands(), [](VPValue *Op) {
-      return Op->isDefinedOutsideLoopRegions();
-    });
-  }
-
 public:
   VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef<VPValue *> Operands)
       : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9a5591feb3d05..ba145ffa0b681 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2523,51 +2523,32 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
   // is vector-typed. Thus, to keep the representation compact, we only use
   // vector-typed operands for loop-varying values.
 
-  if (areAllOperandsInvariant()) {
-    // If we are vectorizing, but the GEP has only loop-invariant operands,
-    // the GEP we build (by only using vector-typed operands for
-    // loop-varying values) would be a scalar pointer. Thus, to ensure we
-    // produce a vector of pointers, we need to either arbitrarily pick an
-    // operand to broadcast, or broadcast a clone of the original GEP.
-    // Here, we broadcast a clone of the original.
-    //
-    // TODO: If at some point we decide to scalarize instructions having
-    //       loop-invariant operands, this special case will no longer be
-    //       required. We would add the scalarization decision to
-    //       collectLoopScalars() and teach getVectorValue() to broadcast
-    //       the lane-zero scalar value.
-    SmallVector<Value *> Ops;
-    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
-      Ops.push_back(State.get(getOperand(I), VPLane(0)));
-
-    auto *NewGEP =
-        State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops),
-                                "", getGEPNoWrapFlags());
-    Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP);
-    State.set(this, Splat);
-  } else {
-    // If the GEP has at least one loop-varying operand, we are sure to
-    // produce a vector of pointers unless VF is scalar.
-    // The pointer operand of the new GEP. If it's loop-invariant, we
-    // won't broadcast it.
-    auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
-
-    // Collect all the indices for the new GEP. If any index is
-    // loop-invariant, we won't broadcast it.
-    SmallVector<Value *, 4> Indices;
-    for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
-      VPValue *Operand = getOperand(I);
-      Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
-    }
-
-    // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
-    // but it should be a vector, otherwise.
-    auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
-                                           "", getGEPNoWrapFlags());
-    assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
-           "NewGEP is not a pointer vector");
-    State.set(this, NewGEP);
-  }
+  assert(
+      any_of(operands(),
+             [](VPValue *Op) { return !Op->isDefinedOutsideLoopRegions(); }) &&
+      "Expected at least one loop-variant operand");
+
+  // If the GEP has at least one loop-varying operand, we are sure to
+  // produce a vector of pointers unless VF is scalar.
+  // The pointer operand of the new GEP. If it's loop-invariant, we
+  // won't broadcast it.
+  auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant());
+
+  // Collect all the indices for the new GEP. If any index is
+  // loop-invariant, we won't broadcast it.
+  SmallVector<Value *, 4> Indices;
+  for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
+    VPValue *Operand = getOperand(I);
+    Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1)));
+  }
+
+  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+  // but it should be a vector, otherwise.
+  auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices,
+                                         "", getGEPNoWrapFlags());
+  assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
+         "NewGEP is not a pointer vector");
+  State.set(this, NewGEP);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index eab642678702a..7eee51e9f2bbf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1391,7 +1391,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
-      if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
+      if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPWidenGEPRecipe,
+               VPReplicateRecipe>(&R))
         continue;
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
       if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
index 212a5c99676f4..877484f5159fd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
@@ -63,7 +63,7 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) {
 ; CHECK-NEXT:    store i32 [[STORE]], ptr [[NBRBOXES]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp sgt i32 [[IV]], [[IBOX]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -114,7 +114,7 @@ define void @predicated_strided_store(ptr %start) {
 ; RVA23-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
 ; RVA23-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; RVA23-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RVA23-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; RVA23-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; RVA23:       middle.block:
 ; RVA23-NEXT:    br label [[LOOP:%.*]]
 ; RVA23:       exit:
@@ -141,7 +141,7 @@ define void @predicated_strided_store(ptr %start) {
 ; RVA23ZVL1024B-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
 ; RVA23ZVL1024B-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; RVA23ZVL1024B-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; RVA23ZVL1024B-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; RVA23ZVL1024B-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; RVA23ZVL1024B:       middle.block:
 ; RVA23ZVL1024B-NEXT:    br label [[LOOP:%.*]]
 ; RVA23ZVL1024B:       exit:
@@ -185,16 +185,16 @@ define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr no
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[P1:%.*]], <vscale x 2 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[TMP5]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[P2:%.*]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[P3:%.*]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[P3:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
 ; CHECK-NEXT:    call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x ptr> align 1 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       exit:
diff --git a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
index d08ca8c99e8ba..c37bf74f9c1b0 100644
--- a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
+++ b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
@@ -8,14 +8,14 @@ define void @pr63340(ptr %A, ptr %B) {
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 1
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x ptr> [[DOTSPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[B]], i8 [[OFFSET_IDX]]
-; CHECK-NEXT:    store <4 x ptr> [[DOTSPLAT]], ptr [[TMP1]], align 8
+; CHECK-NEXT:    store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -55,11 +55,11 @@ define void @wide_gep_index_invariant(ptr noalias %dst, ptr noalias %src, i64 %n
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SRC]], align 8
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, ptr [[TMP0]], i64 [[N]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, <4 x ptr> [[BROADCAST_SPLAT]], i64 [[N]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT:    store <4 x ptr> [[TMP1]], ptr [[TMP2]], align 8
+; CHECK-NEXT:    store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]

From efc0ab0f81451e495ae0f8a746ca6960dab0fb73 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Tue, 11 Nov 2025 10:43:31 +0000
Subject: [PATCH 90/97] [AArch64][CostModel] Add SVE bfloat arithmetic tests
 (NFC) (#166951)

This patch adds cost model tests for `bfloat` operations with
`+sve-b16b16`. Currently, some of these costs are higher than they
should be as the cost model is assuming `bfloat`s need promotion, but
some of these operations are natively supported with `+sve-b16b16`.
---
 .../CostModel/AArch64/sve-arith-fp.ll         | 130 +++++++++++++++++-
 1 file changed, 129 insertions(+), 1 deletion(-)

diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
index ec848c2c08305..d9e26dc47b53f 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 -mattr=+sve | FileCheck %s
+; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16,+sve | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
+; RUN: opt < %s -enable-no-nans-fp-math -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16,+sve-b16b16,+sve | FileCheck %s --check-prefixes=CHECK,CHECK-BF16
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
@@ -31,6 +32,21 @@ define void @fadd() {
   ret void
 }
 
+define void @fadd_bf16() {
+; CHECK-LABEL: 'fadd_bf16'
+; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
+  %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
+  %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
+
+  ret void
+}
+
+
 define void @fsub() {
 ; CHECK-LABEL: 'fsub'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fsub <vscale x 4 x half> poison, poison
@@ -59,6 +75,20 @@ define void @fsub() {
   ret void
 }
 
+define void @fsub_bf16() {
+; CHECK-LABEL: 'fsub_bf16'
+; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
+  %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
+  %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
+
+  ret void
+}
+
 define void @fneg() {
 ; CHECK-LABEL: 'fneg'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F16 = fneg <vscale x 2 x half> poison
@@ -87,6 +117,22 @@ define void @fneg() {
   ret void
 }
 
+define void @fneg_bf16() {
+; CHECK-LABEL: 'fneg_bf16'
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV2BF16 = fneg <vscale x 2 x bfloat> poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fneg <vscale x 4 x bfloat> poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fneg <vscale x 8 x bfloat> poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fneg <vscale x 16 x bfloat> poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %NXV2BF16 = fneg <vscale x 2 x bfloat> poison
+  %NXV4BF16 = fneg <vscale x 4 x bfloat> poison
+  %NXV8BF16 = fneg <vscale x 8 x bfloat> poison
+  %NXV16BF16 = fneg <vscale x 16 x bfloat> poison
+
+  ret void
+}
+
 define void @fmul() {
 ; CHECK-LABEL: 'fmul'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fmul <vscale x 4 x half> poison, poison
@@ -113,6 +159,20 @@ define void @fmul() {
   ret void
 }
 
+define void @fmul_bf16() {
+; CHECK-LABEL: 'fmul_bf16'
+; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:58 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
+  %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
+  %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
+
+  ret void
+}
+
 define void @fdiv() {
 ; CHECK-LABEL: 'fdiv'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = fdiv <vscale x 4 x half> poison, poison
@@ -139,6 +199,20 @@ define void @fdiv() {
   ret void
 }
 
+define void @fdiv_bf16() {
+; CHECK-LABEL: 'fdiv_bf16'
+; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NXV4BF16 = fdiv <vscale x 4 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:4 Lat:4 SizeLat:4 for: %NXV8BF16 = fdiv <vscale x 8 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:58 CodeSize:4 Lat:4 SizeLat:4 for: %NXV16BF16 = fdiv <vscale x 16 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %NXV4BF16 = fdiv <vscale x 4 x bfloat> poison, poison
+  %NXV8BF16 = fdiv <vscale x 8 x bfloat> poison, poison
+  %NXV16BF16 = fdiv <vscale x 16 x bfloat> poison, poison
+
+  ret void
+}
+
 define void @frem() {
 ; CHECK-LABEL: 'frem'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = frem <vscale x 4 x half> poison, poison
@@ -165,6 +239,20 @@ define void @frem() {
   ret void
 }
 
+define void @frem_bf16() {
+; CHECK-LABEL: 'frem_bf16'
+; CHECK-NEXT:  Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV4BF16 = frem <vscale x 4 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV8BF16 = frem <vscale x 8 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV16BF16 = frem <vscale x 16 x bfloat> poison, poison
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %NXV4BF16 = frem <vscale x 4 x bfloat> poison, poison
+  %NXV8BF16 = frem <vscale x 8 x bfloat> poison, poison
+  %NXV16BF16 = frem <vscale x 16 x bfloat> poison, poison
+
+  ret void
+}
+
 define void @fma() {
 ; CHECK-LABEL: 'fma'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
@@ -191,6 +279,26 @@ define void @fma() {
   ret void
 }
 
+define void @fma_bf16() {
+; CHECK-BASE-LABEL: 'fma_bf16'
+; CHECK-BASE-NEXT:  Cost Model: Found costs of 1 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of 1 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-BF16-LABEL: 'fma_bf16'
+; CHECK-BF16-NEXT:  Cost Model: Found costs of 2 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; CHECK-BF16-NEXT:  Cost Model: Found costs of 2 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; CHECK-BF16-NEXT:  Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+  %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+  %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+
+  ret void
+}
+
 define void @fmuladd() {
 ; CHECK-LABEL: 'fmuladd'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
@@ -216,3 +324,23 @@ define void @fmuladd() {
 
   ret void
 }
+
+define void @fmuladd_bf16() {
+; CHECK-BASE-LABEL: 'fmuladd_bf16'
+; CHECK-BASE-NEXT:  Cost Model: Found costs of 1 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of 1 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-BF16-LABEL: 'fmuladd_bf16'
+; CHECK-BF16-NEXT:  Cost Model: Found costs of 2 for: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; CHECK-BF16-NEXT:  Cost Model: Found costs of 2 for: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; CHECK-BF16-NEXT:  Cost Model: Found costs of 4 for: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+  %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+  %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+  %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+
+  ret void
+}

From 0f4c8dd87af61ec5e254bec9f144e9dbc8bc2229 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Tue, 11 Nov 2025 10:41:21 +0000
Subject: [PATCH 91/97] [lldb][test] Fix ThreadStepUntilAPI.py

After #167359 / 95db31e7f69e7008f0c570ed30d54d1e418e10f2.

A fix was attempted in #167423 but was not quite enough.

From what I could understand, in v1 format you have to specify
all the basic blocks. Where before !call_me implied they were all
cold (I think, very shaky understanding here).

For this test we want to see blocks like call_me/foo/call_me.
So adding a line for block 1 fixes the tests.

It could produce more blocks at some point but I think as long
as foo is within two of them, it'll be fine.
---
 lldb/test/API/functionalities/thread/step_until/function.list | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/test/API/functionalities/thread/step_until/function.list b/lldb/test/API/functionalities/thread/step_until/function.list
index 43c121eff9631..d8caa20ad3550 100644
--- a/lldb/test/API/functionalities/thread/step_until/function.list
+++ b/lldb/test/API/functionalities/thread/step_until/function.list
@@ -1,3 +1,4 @@
 v1
 f call_me
 c 0
+c 1

From 3a7359545ae533a59783fe6f9815b421ca1443dd Mon Sep 17 00:00:00 2001
From: Ron Lieberman <ron.lieberman@amd.com>
Date: Tue, 11 Nov 2025 04:54:50 -0600
Subject: [PATCH 92/97] Revert "Reland "[clang] Refactor option-related code
 from clangDriver into new clangOptions library" (#167374)"

This reverts commit f63d33da0a517a9a7096e0e67defb50c5995dd41.
---
 clang-tools-extra/clangd/CMakeLists.txt       |   1 -
 clang-tools-extra/clangd/CompileCommands.cpp  |  27 +-
 clang-tools-extra/modularize/CMakeLists.txt   |   1 -
 .../modularize/CoverageChecker.cpp            |   6 +-
 clang-tools-extra/modularize/Modularize.cpp   |   4 +-
 .../modularize/ModularizeUtilities.cpp        |   6 +-
 clang-tools-extra/pp-trace/CMakeLists.txt     |   1 -
 clang-tools-extra/pp-trace/PPTrace.cpp        |   2 +-
 clang/docs/CMakeLists.txt                     |   2 +-
 clang/docs/InternalsManual.rst                |   8 +-
 clang/docs/ReleaseNotes.rst                   |   3 -
 clang/include/clang/CMakeLists.txt            |   2 +-
 .../clang/{Options => Driver}/CMakeLists.txt  |   0
 .../{Options => Driver}/ClangOptionDocs.td    |   0
 clang/include/clang/Driver/Driver.h           |   2 +-
 .../clang/{Options => Driver}/OptionUtils.h   |   6 +-
 .../clang/{Options => Driver}/Options.h       |  20 +-
 .../clang/{Options => Driver}/Options.td      |   0
 clang/include/clang/Frontend/Utils.h          |   2 +-
 clang/include/module.modulemap                |   1 -
 clang/lib/CMakeLists.txt                      |   1 -
 clang/lib/Driver/CMakeLists.txt               |   3 +-
 clang/lib/Driver/Compilation.cpp              |   2 +-
 clang/lib/Driver/Driver.cpp                   |   3 +-
 .../lib/{Options => Driver}/DriverOptions.cpp |  19 +-
 clang/lib/{Options => Driver}/OptionUtils.cpp |   2 +-
 clang/lib/Driver/SanitizerArgs.cpp            |   2 +-
 clang/lib/Driver/ToolChain.cpp                |   6 +-
 clang/lib/Driver/ToolChains/AIX.cpp           |   6 +-
 clang/lib/Driver/ToolChains/AMDGPU.cpp        |  21 +-
 clang/lib/Driver/ToolChains/AMDGPU.h          |   2 +-
 clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp  |   3 +-
 clang/lib/Driver/ToolChains/AVR.cpp           |   2 +-
 clang/lib/Driver/ToolChains/Arch/AArch64.cpp  |   4 +-
 clang/lib/Driver/ToolChains/Arch/ARM.cpp      |   4 +-
 clang/lib/Driver/ToolChains/Arch/CSKY.cpp     |   6 +-
 .../lib/Driver/ToolChains/Arch/LoongArch.cpp  |   5 +-
 clang/lib/Driver/ToolChains/Arch/M68k.cpp     |  16 +-
 clang/lib/Driver/ToolChains/Arch/Mips.cpp     |   5 +-
 clang/lib/Driver/ToolChains/Arch/PPC.cpp      |   2 +-
 clang/lib/Driver/ToolChains/Arch/RISCV.cpp    |   2 +-
 clang/lib/Driver/ToolChains/Arch/Sparc.cpp    |   4 +-
 clang/lib/Driver/ToolChains/Arch/SystemZ.cpp  |  10 +-
 clang/lib/Driver/ToolChains/Arch/VE.cpp       |   2 +-
 clang/lib/Driver/ToolChains/Arch/X86.cpp      |  18 +-
 clang/lib/Driver/ToolChains/BareMetal.cpp     |   4 +-
 clang/lib/Driver/ToolChains/CSKYToolChain.cpp |   2 +-
 clang/lib/Driver/ToolChains/Clang.cpp         |  13 +-
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |  24 +-
 clang/lib/Driver/ToolChains/CrossWindows.cpp  |   2 +-
 clang/lib/Driver/ToolChains/Cuda.cpp          |   8 +-
 clang/lib/Driver/ToolChains/Cygwin.cpp        |   4 +-
 clang/lib/Driver/ToolChains/Darwin.cpp        |   6 +-
 clang/lib/Driver/ToolChains/DragonFly.cpp     |   4 +-
 clang/lib/Driver/ToolChains/Flang.cpp         |   7 +-
 clang/lib/Driver/ToolChains/FreeBSD.cpp       |   4 +-
 clang/lib/Driver/ToolChains/Fuchsia.cpp       |   4 +-
 clang/lib/Driver/ToolChains/Gnu.cpp           |  10 +-
 clang/lib/Driver/ToolChains/HIPAMD.cpp        |   2 +-
 clang/lib/Driver/ToolChains/HIPSPV.cpp        |   4 +-
 clang/lib/Driver/ToolChains/HIPUtility.cpp    |   2 +-
 clang/lib/Driver/ToolChains/Hexagon.cpp       |   2 +-
 clang/lib/Driver/ToolChains/Hurd.cpp          |   4 +-
 clang/lib/Driver/ToolChains/Linux.cpp         |   4 +-
 clang/lib/Driver/ToolChains/MSP430.cpp        |   2 +-
 clang/lib/Driver/ToolChains/MSVC.cpp          |   2 +-
 clang/lib/Driver/ToolChains/Managarm.cpp      |   4 +-
 clang/lib/Driver/ToolChains/MinGW.cpp         |   2 +-
 clang/lib/Driver/ToolChains/MipsLinux.cpp     |   4 +-
 clang/lib/Driver/ToolChains/NetBSD.cpp        |   4 +-
 clang/lib/Driver/ToolChains/OHOS.cpp          |   4 +-
 clang/lib/Driver/ToolChains/OpenBSD.cpp       |   4 +-
 clang/lib/Driver/ToolChains/PPCFreeBSD.cpp    |   4 +-
 clang/lib/Driver/ToolChains/PPCLinux.cpp      |   4 +-
 clang/lib/Driver/ToolChains/PS4CPU.cpp        |   2 +-
 clang/lib/Driver/ToolChains/SPIRV.cpp         |   2 +-
 clang/lib/Driver/ToolChains/SYCL.cpp          |   2 +-
 clang/lib/Driver/ToolChains/Solaris.cpp       |   4 +-
 clang/lib/Driver/ToolChains/UEFI.cpp          |   2 +-
 clang/lib/Driver/ToolChains/VEToolchain.cpp   |   6 +-
 clang/lib/Driver/ToolChains/WebAssembly.cpp   |   6 +-
 clang/lib/Driver/ToolChains/XCore.cpp         |   6 +-
 clang/lib/Driver/ToolChains/ZOS.cpp           |   2 +-
 clang/lib/Driver/XRayArgs.cpp                 |   2 +-
 clang/lib/Frontend/CMakeLists.txt             |   1 -
 clang/lib/Frontend/CompilerInvocation.cpp     |  54 +-
 .../CreateInvocationFromCommandLine.cpp       |   6 +-
 clang/lib/FrontendTool/CMakeLists.txt         |   1 -
 .../ExecuteCompilerInvocation.cpp             |   6 +-
 clang/lib/Interpreter/Interpreter.cpp         |   4 +-
 clang/lib/Interpreter/InterpreterUtils.h      |   2 +-
 clang/lib/Options/CMakeLists.txt              |  18 -
 clang/lib/Tooling/CMakeLists.txt              |   1 -
 .../InterpolatingCompilationDatabase.cpp      |  12 +-
 clang/lib/Tooling/Tooling.cpp                 |  12 +-
 clang/tools/clang-check/CMakeLists.txt        |   1 -
 clang/tools/clang-check/ClangCheck.cpp        |   4 +-
 clang/tools/clang-installapi/CMakeLists.txt   |   1 -
 .../clang-installapi/ClangInstallAPI.cpp      |   4 +-
 clang/tools/clang-installapi/Options.cpp      |  55 +-
 clang/tools/driver/CMakeLists.txt             |   1 -
 clang/tools/driver/cc1_main.cpp               |   2 +-
 clang/tools/driver/cc1as_main.cpp             |   8 +-
 clang/tools/driver/driver.cpp                 |   2 +-
 clang/unittests/Driver/DXCModeTest.cpp        |   4 +-
 clang/www/OpenProjects.html                   |   2 +-
 flang/docs/CMakeLists.txt                     |   2 +-
 flang/docs/FlangDriver.md                     |   8 +-
 flang/lib/Frontend/CMakeLists.txt             |   1 -
 flang/lib/Frontend/CompilerInvocation.cpp     | 553 ++++++++++--------
 flang/lib/FrontendTool/CMakeLists.txt         |   1 -
 .../ExecuteCompilerInvocation.cpp             |   6 +-
 flang/tools/flang-driver/CMakeLists.txt       |   1 -
 flang/tools/flang-driver/driver.cpp           |   4 +-
 .../Platform/MacOSX/PlatformDarwin.cpp        |   4 +-
 llvm/include/llvm/MC/MCAsmInfo.h              |   2 +-
 llvm/include/llvm/Option/Arg.h                |   2 +-
 revert_patches.txt                            |   3 +
 .../llvm-project-overlay/clang/BUILD.bazel    |   4 +-
 119 files changed, 623 insertions(+), 591 deletions(-)
 rename clang/include/clang/{Options => Driver}/CMakeLists.txt (100%)
 rename clang/include/clang/{Options => Driver}/ClangOptionDocs.td (100%)
 rename clang/include/clang/{Options => Driver}/OptionUtils.h (94%)
 rename clang/include/clang/{Options => Driver}/Options.h (83%)
 rename clang/include/clang/{Options => Driver}/Options.td (100%)
 rename clang/lib/{Options => Driver}/DriverOptions.cpp (76%)
 rename clang/lib/{Options => Driver}/OptionUtils.cpp (97%)
 delete mode 100644 clang/lib/Options/CMakeLists.txt

diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt
index d7ec853af862f..fb3f05329be21 100644
--- a/clang-tools-extra/clangd/CMakeLists.txt
+++ b/clang-tools-extra/clangd/CMakeLists.txt
@@ -165,7 +165,6 @@ clang_target_link_libraries(clangDaemon
   clangBasic
   clangDependencyScanning
   clangDriver
-  clangOptions
   clangFormat
   clangFrontend
   clangIndex
diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp
index 7990f2719e9a0..c1be93730129a 100644
--- a/clang-tools-extra/clangd/CompileCommands.cpp
+++ b/clang-tools-extra/clangd/CompileCommands.cpp
@@ -11,8 +11,8 @@
 #include "support/Logger.h"
 #include "support/Trace.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInvocation.h"
-#include "clang/Options/Options.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -206,7 +206,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command,
   if (Cmd.empty())
     return;
 
-  auto &OptTable = getDriverOptTable();
+  auto &OptTable = clang::driver::getDriverOptTable();
   // OriginalArgs needs to outlive ArgList.
   llvm::SmallVector<const char *, 16> OriginalArgs;
   OriginalArgs.reserve(Cmd.size());
@@ -222,8 +222,8 @@ void CommandMangler::operator()(tooling::CompileCommand &Command,
   llvm::opt::InputArgList ArgList;
   ArgList = OptTable.ParseArgs(
       llvm::ArrayRef(OriginalArgs).drop_front(), IgnoredCount, IgnoredCount,
-      llvm::opt::Visibility(IsCLMode ? options::CLOption
-                                     : options::ClangOption));
+      llvm::opt::Visibility(IsCLMode ? driver::options::CLOption
+                                     : driver::options::ClangOption));
 
   llvm::SmallVector<unsigned, 1> IndicesToDrop;
   // Having multiple architecture options (e.g. when building fat binaries)
@@ -232,7 +232,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command,
   // As there are no signals to figure out which one user actually wants. They
   // can explicitly specify one through `CompileFlags.Add` if need be.
   unsigned ArchOptCount = 0;
-  for (auto *Input : ArgList.filtered(options::OPT_arch)) {
+  for (auto *Input : ArgList.filtered(driver::options::OPT_arch)) {
     ++ArchOptCount;
     for (auto I = 0U; I <= Input->getNumValues(); ++I)
       IndicesToDrop.push_back(Input->getIndex() + I);
@@ -262,12 +262,13 @@ void CommandMangler::operator()(tooling::CompileCommand &Command,
   // explicitly at the end of the flags. This ensures modifications done in the
   // following steps apply in more cases (like setting -x, which only affects
   // inputs that come after it).
-  for (auto *Input : ArgList.filtered(options::OPT_INPUT)) {
+  for (auto *Input : ArgList.filtered(driver::options::OPT_INPUT)) {
     SawInput(Input->getValue(0));
     IndicesToDrop.push_back(Input->getIndex());
   }
   // Anything after `--` is also treated as input, drop them as well.
-  if (auto *DashDash = ArgList.getLastArgNoClaim(options::OPT__DASH_DASH)) {
+  if (auto *DashDash =
+          ArgList.getLastArgNoClaim(driver::options::OPT__DASH_DASH)) {
     auto DashDashIndex = DashDash->getIndex() + 1; // +1 accounts for Cmd[0]
     // Another +1 so we don't treat the `--` itself as an input.
     for (unsigned I = DashDashIndex + 1; I < Cmd.size(); ++I)
@@ -423,11 +424,11 @@ DriverMode getDriverMode(const std::vector<std::string> &Args) {
 // Returns the set of DriverModes where an option may be used.
 unsigned char getModes(const llvm::opt::Option &Opt) {
   unsigned char Result = DM_None;
-  if (Opt.hasVisibilityFlag(options::ClangOption))
+  if (Opt.hasVisibilityFlag(driver::options::ClangOption))
     Result |= DM_GCC;
-  if (Opt.hasVisibilityFlag(options::CC1Option))
+  if (Opt.hasVisibilityFlag(driver::options::CC1Option))
     Result |= DM_CC1;
-  if (Opt.hasVisibilityFlag(options::CLOption))
+  if (Opt.hasVisibilityFlag(driver::options::CLOption))
     Result |= DM_CL;
   return Result;
 }
@@ -441,8 +442,8 @@ llvm::ArrayRef<ArgStripper::Rule> ArgStripper::rulesFor(llvm::StringRef Arg) {
   using TableTy =
       llvm::StringMap<llvm::SmallVector<Rule, 4>, llvm::BumpPtrAllocator>;
   static TableTy *Table = [] {
-    auto &DriverTable = getDriverOptTable();
-    using DriverID = clang::options::ID;
+    auto &DriverTable = driver::getDriverOptTable();
+    using DriverID = clang::driver::options::ID;
 
     // Collect sets of aliases, so we can treat -foo and -foo= as synonyms.
     // Conceptually a double-linked list: PrevAlias[I] -> I -> NextAlias[I].
@@ -467,7 +468,7 @@ llvm::ArrayRef<ArgStripper::Rule> ArgStripper::rulesFor(llvm::StringRef Arg) {
                FLAGS, VISIBILITY, PARAM, HELPTEXT, HELPTEXTSFORVARIANTS,       \
                METAVAR, VALUES, SUBCOMMANDIDS_OFFSET)                          \
   {DriverID::OPT_##ID, DriverID::OPT_##ALIAS, ALIASARGS},
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef OPTION
     };
     for (auto &E : AliasTable)
diff --git a/clang-tools-extra/modularize/CMakeLists.txt b/clang-tools-extra/modularize/CMakeLists.txt
index a775b790a3147..eb5383c3ad44e 100644
--- a/clang-tools-extra/modularize/CMakeLists.txt
+++ b/clang-tools-extra/modularize/CMakeLists.txt
@@ -20,7 +20,6 @@ clang_target_link_libraries(modularize
   clangAST
   clangBasic
   clangDriver
-  clangOptions
   clangFrontend
   clangLex
   clangSerialization
diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp
index d80d78c64c6e2..1345a6ef8f489 100644
--- a/clang-tools-extra/modularize/CoverageChecker.cpp
+++ b/clang-tools-extra/modularize/CoverageChecker.cpp
@@ -50,18 +50,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "CoverageChecker.h"
 #include "ModularizeUtilities.h"
 #include "clang/AST/ASTConsumer.h"
+#include "CoverageChecker.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Lex/Preprocessor.h"
-#include "clang/Options/Options.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/Option/Option.h"
@@ -73,7 +73,7 @@
 using namespace Modularize;
 using namespace clang;
 using namespace clang::driver;
-using namespace clang::options;
+using namespace clang::driver::options;
 using namespace clang::tooling;
 namespace cl = llvm::cl;
 namespace sys = llvm::sys;
diff --git a/clang-tools-extra/modularize/Modularize.cpp b/clang-tools-extra/modularize/Modularize.cpp
index 33966b44f719a..376ad0c7875bf 100644
--- a/clang-tools-extra/modularize/Modularize.cpp
+++ b/clang-tools-extra/modularize/Modularize.cpp
@@ -231,11 +231,11 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Lex/Preprocessor.h"
-#include "clang/Options/Options.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/Option/Arg.h"
@@ -254,7 +254,7 @@
 
 using namespace clang;
 using namespace clang::driver;
-using namespace clang::options;
+using namespace clang::driver::options;
 using namespace clang::tooling;
 using namespace llvm;
 using namespace llvm::opt;
diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp
index 6978a6b2fe1b7..4dd84feac5df4 100644
--- a/clang-tools-extra/modularize/ModularizeUtilities.cpp
+++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp
@@ -12,17 +12,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ModularizeUtilities.h"
-#include "CoverageChecker.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendActions.h"
-#include "clang/Options/Options.h"
+#include "CoverageChecker.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/FileUtilities.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
+#include "ModularizeUtilities.h"
 
 using namespace clang;
 using namespace llvm;
diff --git a/clang-tools-extra/pp-trace/CMakeLists.txt b/clang-tools-extra/pp-trace/CMakeLists.txt
index da36582ee0234..1323adbc35269 100644
--- a/clang-tools-extra/pp-trace/CMakeLists.txt
+++ b/clang-tools-extra/pp-trace/CMakeLists.txt
@@ -14,7 +14,6 @@ clang_target_link_libraries(pp-trace
   PRIVATE
   clangAST
   clangBasic
-  clangOptions
   clangFrontend
   clangLex
   clangSerialization
diff --git a/clang-tools-extra/pp-trace/PPTrace.cpp b/clang-tools-extra/pp-trace/PPTrace.cpp
index ba5a06a26830d..0b078c49a55b7 100644
--- a/clang-tools-extra/pp-trace/PPTrace.cpp
+++ b/clang-tools-extra/pp-trace/PPTrace.cpp
@@ -28,11 +28,11 @@
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Lex/Preprocessor.h"
-#include "clang/Options/Options.h"
 #include "clang/Tooling/Execution.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/Option/Arg.h"
diff --git a/clang/docs/CMakeLists.txt b/clang/docs/CMakeLists.txt
index 9469a832adb62..1f06c040c96cb 100644
--- a/clang/docs/CMakeLists.txt
+++ b/clang/docs/CMakeLists.txt
@@ -132,7 +132,7 @@ if (LLVM_ENABLE_SPHINX)
     # Generated files
     gen_rst_file_from_td(AttributeReference.rst -gen-attr-docs ../include/clang/Basic/Attr.td "${docs_targets}")
     gen_rst_file_from_td(DiagnosticsReference.rst -gen-diag-docs ../include/clang/Basic/Diagnostic.td "${docs_targets}")
-    gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Options/ClangOptionDocs.td "${docs_targets}")
+    gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Driver/ClangOptionDocs.td "${docs_targets}")
 
     # Another generated file from a different source
     set(docs_tools_dir ${CMAKE_CURRENT_SOURCE_DIR}/tools)
diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst
index a849d05eb7ae9..eff46ab46e1ca 100644
--- a/clang/docs/InternalsManual.rst
+++ b/clang/docs/InternalsManual.rst
@@ -667,7 +667,7 @@ Command Line Interface
 ----------------------
 
 The command line interface of the Clang ``-cc1`` frontend is defined alongside
-the driver options in ``clang/Options/Options.td``. The information making up an
+the driver options in ``clang/Driver/Options.td``. The information making up an
 option definition includes its prefix and name (for example ``-std=``), form and
 position of the option value, help text, aliases and more. Each option may
 belong to a certain group and can be marked with zero or more flags. Options
@@ -712,7 +712,7 @@ variable for the option value:
     }
 
 Next, declare the command line interface of the option in the tablegen file
-``clang/include/clang/Options/Options.td``. This is done by instantiating the
+``clang/include/clang/Driver/Options.td``. This is done by instantiating the
 ``Option`` class (defined in ``llvm/include/llvm/Option/OptParser.td``). The
 instance is typically created through one of the helper classes that encode the
 acceptable ways to specify the option value on the command line:
@@ -906,7 +906,7 @@ command line:
                                   SHOULD_PARSE, KEYPATH, DEFAULT_VALUE,          \
                                   IMPLIED_CHECK, IMPLIED_VALUE, NORMALIZER,      \
                                   MERGER, TABLE_INDEX)
-  #include "clang/Options/Options.inc"
+  #include "clang/Driver/Options.inc"
   #undef LANG_OPTION_WITH_MARSHALLING
 
     // ...
@@ -925,7 +925,7 @@ command line:
     GENERATE_OPTION_WITH_MARSHALLING(                                            \
         Args, SA, KIND, FLAGS, SPELLING, ALWAYS_EMIT, KEYPATH, DEFAULT_VALUE,    \
         IMPLIED_CHECK, IMPLIED_VALUE, DENORMALIZER, EXTRACTOR, TABLE_INDEX)
-  #include "clang/Options/Options.inc"
+  #include "clang/Driver/Options.inc"
   #undef LANG_OPTION_WITH_MARSHALLING
 
     // ...
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 9d3ac48d2e0ca..5f356daec2d04 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -79,9 +79,6 @@ Potentially Breaking Changes
     void foo(void) {
       return ({ 1;; });
     }
-- Downstream projects that previously linked only against ``clangDriver`` may
-  now (also) need to link against the new ``clangOptions`` library, since
-  options-related code has been moved out of the Driver into a separate library.
 
 C/C++ Language Potentially Breaking Changes
 -------------------------------------------
diff --git a/clang/include/clang/CMakeLists.txt b/clang/include/clang/CMakeLists.txt
index 77a44e4c48de5..47ac70cd21690 100644
--- a/clang/include/clang/CMakeLists.txt
+++ b/clang/include/clang/CMakeLists.txt
@@ -3,7 +3,7 @@ add_subdirectory(Basic)
 if(CLANG_ENABLE_CIR)
   add_subdirectory(CIR)
 endif()
-add_subdirectory(Options)
+add_subdirectory(Driver)
 add_subdirectory(Parse)
 add_subdirectory(Sema)
 add_subdirectory(Serialization)
diff --git a/clang/include/clang/Options/CMakeLists.txt b/clang/include/clang/Driver/CMakeLists.txt
similarity index 100%
rename from clang/include/clang/Options/CMakeLists.txt
rename to clang/include/clang/Driver/CMakeLists.txt
diff --git a/clang/include/clang/Options/ClangOptionDocs.td b/clang/include/clang/Driver/ClangOptionDocs.td
similarity index 100%
rename from clang/include/clang/Options/ClangOptionDocs.td
rename to clang/include/clang/Driver/ClangOptionDocs.td
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index 297afe26812a0..419089731569b 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -15,11 +15,11 @@
 #include "clang/Driver/Action.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/InputInfo.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/Phases.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Driver/Types.h"
 #include "clang/Driver/Util.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/StringMap.h"
diff --git a/clang/include/clang/Options/OptionUtils.h b/clang/include/clang/Driver/OptionUtils.h
similarity index 94%
rename from clang/include/clang/Options/OptionUtils.h
rename to clang/include/clang/Driver/OptionUtils.h
index 83c48bd7d6843..922f536bf33ea 100644
--- a/clang/include/clang/Options/OptionUtils.h
+++ b/clang/include/clang/Driver/OptionUtils.h
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_OPTIONS_OPTIONUTILS_H
-#define LLVM_CLANG_OPTIONS_OPTIONUTILS_H
+#ifndef LLVM_CLANG_DRIVER_OPTIONUTILS_H
+#define LLVM_CLANG_DRIVER_OPTIONUTILS_H
 
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LLVM.h"
@@ -55,4 +55,4 @@ inline uint64_t getLastArgUInt64Value(const llvm::opt::ArgList &Args,
 
 } // namespace clang
 
-#endif // LLVM_CLANG_OPTIONS_OPTIONUTILS_H
+#endif // LLVM_CLANG_DRIVER_OPTIONUTILS_H
diff --git a/clang/include/clang/Options/Options.h b/clang/include/clang/Driver/Options.h
similarity index 83%
rename from clang/include/clang/Options/Options.h
rename to clang/include/clang/Driver/Options.h
index ac98699001965..0797410e9940e 100644
--- a/clang/include/clang/Options/Options.h
+++ b/clang/include/clang/Driver/Options.h
@@ -6,13 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_OPTIONS_OPTIONS_H
-#define LLVM_CLANG_OPTIONS_OPTIONS_H
+#ifndef LLVM_CLANG_DRIVER_OPTIONS_H
+#define LLVM_CLANG_DRIVER_OPTIONS_H
 
 #include "llvm/Option/OptTable.h"
 #include "llvm/Option/Option.h"
 
 namespace clang {
+namespace driver {
 
 namespace options {
 /// Flags specifically for clang options.  Must not overlap with
@@ -41,15 +42,16 @@ enum ClangVisibility {
 };
 
 enum ID {
-  OPT_INVALID = 0, // This is not an option ID.
+    OPT_INVALID = 0, // This is not an option ID.
 #define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__),
-#include "clang/Options/Options.inc"
-  LastOption
+#include "clang/Driver/Options.inc"
+    LastOption
 #undef OPTION
-};
-} // namespace options
+  };
+}
 
 const llvm::opt::OptTable &getDriverOptTable();
-} // namespace clang
+}
+}
 
-#endif // LLVM_CLANG_OPTIONS_OPTIONS_H
+#endif
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Driver/Options.td
similarity index 100%
rename from clang/include/clang/Options/Options.td
rename to clang/include/clang/Driver/Options.td
diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h
index ed2703c76f18d..49fd920d1ec43 100644
--- a/clang/include/clang/Frontend/Utils.h
+++ b/clang/include/clang/Frontend/Utils.h
@@ -15,8 +15,8 @@
 
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LLVM.h"
+#include "clang/Driver/OptionUtils.h"
 #include "clang/Frontend/DependencyOutputOptions.h"
-#include "clang/Options/OptionUtils.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/StringMap.h"
diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap
index a11c8683c601e..c5535262ae38c 100644
--- a/clang/include/module.modulemap
+++ b/clang/include/module.modulemap
@@ -146,7 +146,6 @@ module Clang_Lex {
   module * { export * }
 }
 
-module Clang_Options { requires cplusplus umbrella "clang/Options" module * { export * } }
 module Clang_Parse { requires cplusplus umbrella "clang/Parse" module * { export * } }
 module Clang_Rewrite { requires cplusplus umbrella "clang/Rewrite/Core" module * { export * } }
 module Clang_RewriteFrontend { requires cplusplus umbrella "clang/Rewrite/Frontend" module * { export * } }
diff --git a/clang/lib/CMakeLists.txt b/clang/lib/CMakeLists.txt
index e90b009da606a..4f2218b583e41 100644
--- a/clang/lib/CMakeLists.txt
+++ b/clang/lib/CMakeLists.txt
@@ -13,7 +13,6 @@ add_subdirectory(Edit)
 add_subdirectory(ExtractAPI)
 add_subdirectory(Rewrite)
 add_subdirectory(Driver)
-add_subdirectory(Options)
 add_subdirectory(Serialization)
 add_subdirectory(Frontend)
 add_subdirectory(FrontendTool)
diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
index 8577b3af257a7..a8d83d38264a7 100644
--- a/clang/lib/Driver/CMakeLists.txt
+++ b/clang/lib/Driver/CMakeLists.txt
@@ -20,10 +20,12 @@ add_clang_library(clangDriver
   Compilation.cpp
   Distro.cpp
   Driver.cpp
+  DriverOptions.cpp
   Job.cpp
   Multilib.cpp
   MultilibBuilder.cpp
   OffloadBundler.cpp
+  OptionUtils.cpp
   Phases.cpp
   SanitizerArgs.cpp
   Tool.cpp
@@ -99,7 +101,6 @@ add_clang_library(clangDriver
   LINK_LIBS
   clangBasic
   clangLex
-  clangOptions
   ${system_libs}
   ${LLVM_PTHREAD_LIB}
   )
diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp
index 4a75ae2c74c51..665d81f99ba45 100644
--- a/clang/lib/Driver/Compilation.cpp
+++ b/clang/lib/Driver/Compilation.cpp
@@ -11,9 +11,9 @@
 #include "clang/Driver/Action.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Job.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Driver/Util.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/OptSpecifier.h"
 #include "llvm/Option/Option.h"
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index fb892ae02be9a..89c4b7363f21c 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -60,6 +60,8 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Job.h"
+#include "clang/Driver/Options.h"
+#include "clang/Driver/OptionUtils.h"
 #include "clang/Driver/Phases.h"
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/Tool.h"
@@ -67,7 +69,6 @@
 #include "clang/Driver/ToolChain.h"
 #include "clang/Driver/Util.h"
 #include "clang/Lex/DependencyDirectivesScanner.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
diff --git a/clang/lib/Options/DriverOptions.cpp b/clang/lib/Driver/DriverOptions.cpp
similarity index 76%
rename from clang/lib/Options/DriverOptions.cpp
rename to clang/lib/Driver/DriverOptions.cpp
index d91e9291fb2f6..cde1f8989935b 100644
--- a/clang/lib/Options/DriverOptions.cpp
+++ b/clang/lib/Driver/DriverOptions.cpp
@@ -6,32 +6,33 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/OptTable.h"
 #include <cassert>
 
-using namespace clang::options;
+using namespace clang::driver;
+using namespace clang::driver::options;
 using namespace llvm::opt;
 
 #define OPTTABLE_STR_TABLE_CODE
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef OPTTABLE_STR_TABLE_CODE
 
 #define OPTTABLE_VALUES_CODE
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef OPTTABLE_VALUES_CODE
 
 #define OPTTABLE_PREFIXES_TABLE_CODE
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef OPTTABLE_PREFIXES_TABLE_CODE
 
 #define OPTTABLE_PREFIXES_UNION_CODE
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef OPTTABLE_PREFIXES_UNION_CODE
 
 static constexpr OptTable::Info InfoTable[] = {
 #define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__),
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef OPTION
 };
 
@@ -43,9 +44,9 @@ class DriverOptTable : public PrecomputedOptTable {
       : PrecomputedOptTable(OptionStrTable, OptionPrefixesTable, InfoTable,
                             OptionPrefixesUnion) {}
 };
-} // anonymous namespace
+}
 
-const llvm::opt::OptTable &clang::getDriverOptTable() {
+const llvm::opt::OptTable &clang::driver::getDriverOptTable() {
   static DriverOptTable Table;
   return Table;
 }
diff --git a/clang/lib/Options/OptionUtils.cpp b/clang/lib/Driver/OptionUtils.cpp
similarity index 97%
rename from clang/lib/Options/OptionUtils.cpp
rename to clang/lib/Driver/OptionUtils.cpp
index fcafd3c83c6b3..1f36ffc03cab3 100644
--- a/clang/lib/Options/OptionUtils.cpp
+++ b/clang/lib/Driver/OptionUtils.cpp
@@ -6,9 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Options/OptionUtils.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticDriver.h"
+#include "clang/Driver/OptionUtils.h"
 #include "llvm/Option/ArgList.h"
 
 using namespace clang;
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 420c4cddbc8dd..5dd48f53b9069 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -8,8 +8,8 @@
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Basic/Sanitizers.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/ToolChain.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 5ee53c01d6e30..3452ce5339174 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -21,9 +21,9 @@
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Job.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/XRayArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
@@ -338,7 +338,7 @@ static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple,
 
 Multilib::flags_list
 ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const {
-  using namespace clang::options;
+  using namespace clang::driver::options;
 
   std::vector<std::string> Result;
   const llvm::Triple Triple(ComputeEffectiveClangTriple(Args));
@@ -1816,7 +1816,7 @@ void ToolChain::TranslateXarchArgs(
   unsigned Index = BaseArgs.MakeIndex(A->getValue(ValuePos));
   unsigned Prev = Index;
   std::unique_ptr<llvm::opt::Arg> XarchArg(Opts.ParseOneArg(
-      Args, Index, llvm::opt::Visibility(options::ClangOption)));
+      Args, Index, llvm::opt::Visibility(clang::driver::options::ClangOption)));
 
   // If the argument parsing failed or more than one argument was
   // consumed, the -Xarch_ argument's parameter tried to consume
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index 21e7dcad3cdd0..ffd7b69205440 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -9,8 +9,8 @@
 #include "AIX.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/ProfileData/InstrProf.h"
@@ -19,7 +19,6 @@
 #include <set>
 
 using AIX = clang::driver::toolchains::AIX;
-using namespace clang;
 using namespace clang::driver;
 using namespace clang::driver::tools;
 using namespace clang::driver::toolchains;
@@ -168,7 +167,8 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA,
        Args.hasArg(options::OPT_coverage))
     CmdArgs.push_back("-bdbg:namedsects:ss");
 
-  if (Arg *A = Args.getLastArg(options::OPT_mxcoff_build_id_EQ)) {
+  if (Arg *A =
+          Args.getLastArg(clang::driver::options::OPT_mxcoff_build_id_EQ)) {
     StringRef BuildId = A->getValue();
     if (BuildId[0] != '0' || BuildId[1] != 'x' ||
         BuildId.find_if_not(llvm::isHexDigit, 2) != StringRef::npos)
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 3b5411179349b..edfc62f0ad7d5 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -12,8 +12,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/InputInfo.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Error.h"
@@ -324,24 +324,27 @@ RocmInstallationDetector::RocmInstallationDetector(
     const llvm::opt::ArgList &Args, bool DetectHIPRuntime, bool DetectDeviceLib)
     : D(D) {
   Verbose = Args.hasArg(options::OPT_v);
-  RocmPathArg = Args.getLastArgValue(options::OPT_rocm_path_EQ);
-  PrintROCmSearchDirs = Args.hasArg(options::OPT_print_rocm_search_dirs);
+  RocmPathArg = Args.getLastArgValue(clang::driver::options::OPT_rocm_path_EQ);
+  PrintROCmSearchDirs =
+      Args.hasArg(clang::driver::options::OPT_print_rocm_search_dirs);
   RocmDeviceLibPathArg =
-      Args.getAllArgValues(options::OPT_rocm_device_lib_path_EQ);
-  HIPPathArg = Args.getLastArgValue(options::OPT_hip_path_EQ);
-  HIPStdParPathArg = Args.getLastArgValue(options::OPT_hipstdpar_path_EQ);
+      Args.getAllArgValues(clang::driver::options::OPT_rocm_device_lib_path_EQ);
+  HIPPathArg = Args.getLastArgValue(clang::driver::options::OPT_hip_path_EQ);
+  HIPStdParPathArg =
+    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_path_EQ);
   HasHIPStdParLibrary =
     !HIPStdParPathArg.empty() && D.getVFS().exists(HIPStdParPathArg +
                                                    "/hipstdpar_lib.hpp");
   HIPRocThrustPathArg =
-      Args.getLastArgValue(options::OPT_hipstdpar_thrust_path_EQ);
+    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_thrust_path_EQ);
   HasRocThrustLibrary = !HIPRocThrustPathArg.empty() &&
                         D.getVFS().exists(HIPRocThrustPathArg + "/thrust");
-  HIPRocPrimPathArg = Args.getLastArgValue(options::OPT_hipstdpar_prim_path_EQ);
+  HIPRocPrimPathArg =
+    Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_prim_path_EQ);
   HasRocPrimLibrary = !HIPRocPrimPathArg.empty() &&
                       D.getVFS().exists(HIPRocPrimPathArg + "/rocprim");
 
-  if (auto *A = Args.getLastArg(options::OPT_hip_version_EQ)) {
+  if (auto *A = Args.getLastArg(clang::driver::options::OPT_hip_version_EQ)) {
     HIPVersionArg = A->getValue();
     unsigned Major = ~0U;
     unsigned Minor = ~0U;
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h
index c8601d1dedbcb..7185b24aec0f8 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.h
+++ b/clang/lib/Driver/ToolChains/AMDGPU.h
@@ -11,9 +11,9 @@
 
 #include "Gnu.h"
 #include "clang/Basic/TargetID.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/TargetParser/TargetParser.h"
 
diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
index 324369bfa2cc1..e73b4f02cac39 100644
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -11,8 +11,9 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
+#include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/Tool.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Path.h"
diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp
index 588255dc5a0cd..731076d9754a9 100644
--- a/clang/lib/Driver/ToolChains/AVR.cpp
+++ b/clang/lib/Driver/ToolChains/AVR.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
index d6fb2a57539ed..e8d5e38a9064f 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
@@ -9,7 +9,7 @@
 #include "AArch64.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/TargetParser/Host.h"
@@ -222,7 +222,7 @@ void aarch64::getAArch64TargetFeatures(const Driver &D,
     // Default to 'A' profile if the architecture is not specified.
     success = getAArch64ArchFeaturesFromMarch(D, "armv8-a", Args, Extensions);
 
-  if (success && (A = Args.getLastArg(options::OPT_mtune_EQ)))
+  if (success && (A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)))
     success =
         getAArch64MicroArchFeaturesFromMtune(D, A->getValue(), Args, Features);
   else if (success && (A = Args.getLastArg(options::OPT_mcpu_EQ)))
diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
index 55eb2dcf7ddf4..61beb0455147d 100644
--- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
@@ -8,7 +8,7 @@
 
 #include "ARM.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/ARMTargetParser.h"
@@ -74,7 +74,7 @@ bool arm::isARMEABIBareMetal(const llvm::Triple &Triple) {
 // Get Arch/CPU from args.
 void arm::getARMArchCPUFromArgs(const ArgList &Args, llvm::StringRef &Arch,
                                 llvm::StringRef &CPU, bool FromAs) {
-  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
+  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
     CPU = A->getValue();
   if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
     Arch = A->getValue();
diff --git a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
index 65f6534e4d038..2fd2c72147f5b 100644
--- a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp
@@ -8,7 +8,7 @@
 
 #include "CSKY.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/CSKYTargetParser.h"
@@ -33,7 +33,7 @@ csky::getCSKYArchName(const Driver &D, const ArgList &Args,
     return std::optional<llvm::StringRef>(A->getValue());
   }
 
-  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
+  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
     llvm::CSKY::ArchKind ArchKind = llvm::CSKY::parseCPUArch(A->getValue());
     if (ArchKind == llvm::CSKY::ArchKind::INVALID) {
       D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args);
@@ -126,7 +126,7 @@ void csky::getCSKYTargetFeatures(const Driver &D, const llvm::Triple &Triple,
     archName = A->getValue();
   }
 
-  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
+  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
     llvm::CSKY::ArchKind Kind = llvm::CSKY::parseCPUArch(A->getValue());
     if (Kind == llvm::CSKY::ArchKind::INVALID) {
       D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args);
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index da084bdabaee3..156ea03045569 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -11,7 +11,7 @@
 #include "clang/Basic/DiagnosticDriver.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/TargetParser/Host.h"
 #include "llvm/TargetParser/LoongArchTargetParser.h"
 
@@ -130,7 +130,8 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
                                            const ArgList &Args,
                                            std::vector<StringRef> &Features) {
   // Enable the `lsx` feature on 64-bit LoongArch by default.
-  if (Triple.isLoongArch64() && (!Args.hasArgNoClaim(options::OPT_march_EQ)))
+  if (Triple.isLoongArch64() &&
+      (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ)))
     Features.push_back("+lsx");
 
   // -mrelax is default, unless -mno-relax is specified.
diff --git a/clang/lib/Driver/ToolChains/Arch/M68k.cpp b/clang/lib/Driver/ToolChains/Arch/M68k.cpp
index a620597f10475..708ec84a37cfb 100644
--- a/clang/lib/Driver/ToolChains/Arch/M68k.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/M68k.cpp
@@ -8,7 +8,7 @@
 
 #include "M68k.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Regex.h"
@@ -21,7 +21,7 @@ using namespace llvm::opt;
 
 /// getM68kTargetCPU - Get the (LLVM) name of the 68000 cpu we are targeting.
 std::string m68k::getM68kTargetCPU(const ArgList &Args) {
-  if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
+  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
     // The canonical CPU name is captalize. However, we allow
     // starting with lower case or numbers only
     StringRef CPUName = A->getValue();
@@ -45,17 +45,17 @@ std::string m68k::getM68kTargetCPU(const ArgList &Args) {
         .Default(CPUName.str());
   }
   // FIXME: Throw error when multiple sub-architecture flag exist
-  if (Args.hasArg(options::OPT_m68000))
+  if (Args.hasArg(clang::driver::options::OPT_m68000))
     return "M68000";
-  if (Args.hasArg(options::OPT_m68010))
+  if (Args.hasArg(clang::driver::options::OPT_m68010))
     return "M68010";
-  if (Args.hasArg(options::OPT_m68020))
+  if (Args.hasArg(clang::driver::options::OPT_m68020))
     return "M68020";
-  if (Args.hasArg(options::OPT_m68030))
+  if (Args.hasArg(clang::driver::options::OPT_m68030))
     return "M68030";
-  if (Args.hasArg(options::OPT_m68040))
+  if (Args.hasArg(clang::driver::options::OPT_m68040))
     return "M68040";
-  if (Args.hasArg(options::OPT_m68060))
+  if (Args.hasArg(clang::driver::options::OPT_m68060))
     return "M68060";
 
   return "";
diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
index 103aae7018fbf..8d7b85dbeed99 100644
--- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
@@ -9,7 +9,7 @@
 #include "Mips.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 
@@ -49,7 +49,8 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple,
     DefMips64CPU = "mips3";
   }
 
-  if (Arg *A = Args.getLastArg(options::OPT_march_EQ, options::OPT_mcpu_EQ))
+  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ,
+                               options::OPT_mcpu_EQ))
     CPUName = A->getValue();
 
   if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) {
diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
index 44afdd249fea5..361a68a892a8f 100644
--- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp
@@ -9,7 +9,7 @@
 #include "PPC.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/Host.h"
diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
index 1dcce6d053a39..f2e79e71f93d4 100644
--- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
@@ -10,7 +10,7 @@
 #include "../Clang.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Error.h"
 #include "llvm/TargetParser/Host.h"
diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
index 49256d80cbdf6..94a94f1e9c487 100644
--- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
@@ -8,7 +8,7 @@
 
 #include "Sparc.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/Host.h"
@@ -122,7 +122,7 @@ sparc::FloatABI sparc::getSparcFloatABI(const Driver &D,
 
 std::string sparc::getSparcTargetCPU(const Driver &D, const ArgList &Args,
                                      const llvm::Triple &Triple) {
-  if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) {
+  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) {
     StringRef CPUName = A->getValue();
     if (CPUName == "native") {
       std::string CPU = std::string(llvm::sys::getHostCPUName());
diff --git a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
index 1ef6a725483e8..75b6afd925245 100644
--- a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp
@@ -8,7 +8,7 @@
 
 #include "SystemZ.h"
 #include "clang/Config/config.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/TargetParser/Host.h"
 
@@ -25,9 +25,9 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D,
     D.Diag(diag::err_drv_unsupported_opt)
       << Args.getLastArg(options::OPT_mfloat_abi_EQ)->getAsString(Args);
 
-  if (Arg *A =
-          Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float))
-    if (A->getOption().matches(options::OPT_msoft_float))
+  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_msoft_float,
+                               options::OPT_mhard_float))
+    if (A->getOption().matches(clang::driver::options::OPT_msoft_float))
       ABI = systemz::FloatABI::Soft;
 
   return ABI;
@@ -35,7 +35,7 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D,
 
 std::string systemz::getSystemZTargetCPU(const ArgList &Args,
                                          const llvm::Triple &T) {
-  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
+  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
     llvm::StringRef CPUName = A->getValue();
 
     if (CPUName == "native") {
diff --git a/clang/lib/Driver/ToolChains/Arch/VE.cpp b/clang/lib/Driver/ToolChains/Arch/VE.cpp
index c8353d7dc5f3a..adc0873586588 100644
--- a/clang/lib/Driver/ToolChains/Arch/VE.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/VE.cpp
@@ -8,7 +8,7 @@
 
 #include "VE.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 
 using namespace clang::driver;
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index 092069b6ade56..1373905a5120e 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -8,7 +8,7 @@
 
 #include "X86.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Option/ArgList.h"
@@ -21,7 +21,7 @@ using namespace llvm::opt;
 
 std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args,
                                  const llvm::Triple &Triple) {
-  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
+  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
     StringRef CPU = A->getValue();
     if (CPU != "native")
       return std::string(CPU);
@@ -119,7 +119,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
                                std::vector<StringRef> &Features) {
   // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or
   // "ms_abi" as default function attributes.
-  if (const Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) {
+  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) {
     StringRef DefaultAbi =
         (Triple.isOSWindows() || Triple.isUEFI()) ? "ms" : "sysv";
     if (A->getValue() != DefaultAbi)
@@ -128,7 +128,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
   }
 
   // If -march=native, autodetect the feature list.
-  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
+  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) {
     if (StringRef(A->getValue()) == "native") {
       for (auto &F : llvm::sys::getHostCPUFeatures())
         Features.push_back(
@@ -163,7 +163,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
   // flags). This is a bit hacky but keeps existing usages working. We should
   // consider deprecating this and instead warn if the user requests external
   // retpoline thunks and *doesn't* request some form of retpolines.
-  auto SpectreOpt = options::ID::OPT_INVALID;
+  auto SpectreOpt = clang::driver::options::ID::OPT_INVALID;
   if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline,
                          options::OPT_mspeculative_load_hardening,
                          options::OPT_mno_speculative_load_hardening)) {
@@ -189,7 +189,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
     SpectreOpt = options::OPT_mretpoline_external_thunk;
   }
 
-  auto LVIOpt = options::ID::OPT_INVALID;
+  auto LVIOpt = clang::driver::options::ID::OPT_INVALID;
   if (Args.hasFlag(options::OPT_mlvi_hardening, options::OPT_mno_lvi_hardening,
                    false)) {
     Features.push_back("+lvi-load-hardening");
@@ -207,7 +207,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
           << D.getOpts().getOptionName(options::OPT_mlvi_hardening)
           << D.getOpts().getOptionName(options::OPT_m_seses);
 
-    if (SpectreOpt != options::ID::OPT_INVALID)
+    if (SpectreOpt != clang::driver::options::ID::OPT_INVALID)
       D.Diag(diag::err_drv_argument_not_allowed_with)
           << D.getOpts().getOptionName(SpectreOpt)
           << D.getOpts().getOptionName(options::OPT_m_seses);
@@ -219,8 +219,8 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
     }
   }
 
-  if (SpectreOpt != options::ID::OPT_INVALID &&
-      LVIOpt != options::ID::OPT_INVALID) {
+  if (SpectreOpt != clang::driver::options::ID::OPT_INVALID &&
+      LVIOpt != clang::driver::options::ID::OPT_INVALID) {
     D.Diag(diag::err_drv_argument_not_allowed_with)
         << D.getOpts().getOptionName(SpectreOpt)
         << D.getOpts().getOptionName(LVIOpt);
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index 8d598be9ffb0a..9b7f58c392885 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -18,7 +18,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/MultilibBuilder.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
@@ -135,7 +135,7 @@ static std::string computeClangRuntimesSysRoot(const Driver &D,
 bool BareMetal::initGCCInstallation(const llvm::Triple &Triple,
                                     const llvm::opt::ArgList &Args) {
   if (Args.getLastArg(options::OPT_gcc_toolchain) ||
-      Args.getLastArg(clang::options::OPT_gcc_install_dir_EQ)) {
+      Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) {
     GCCInstallation.init(Triple, Args);
     return GCCInstallation.isValid();
   }
diff --git a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
index c561d7d38da5b..e4db3307ee3aa 100644
--- a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
+++ b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 5c5609a3c8377..c5d8dea358c0a 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -30,10 +30,10 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Distro.h"
 #include "clang/Driver/InputInfo.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/Types.h"
 #include "clang/Driver/XRayArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
@@ -66,7 +66,7 @@ using namespace clang;
 using namespace llvm::opt;
 
 static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) {
-  if (Arg *A = Args.getLastArg(options::OPT_C, options::OPT_CC,
+  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC,
                                options::OPT_fminimize_whitespace,
                                options::OPT_fno_minimize_whitespace,
                                options::OPT_fkeep_system_includes,
@@ -1700,7 +1700,7 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
 
   AddAAPCSVolatileBitfieldArgs(Args, CmdArgs);
 
-  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
+  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
     CmdArgs.push_back("-tune-cpu");
     if (strcmp(A->getValue(), "native") == 0)
       CmdArgs.push_back(Args.MakeArgString(llvm::sys::getHostCPUName()));
@@ -2106,7 +2106,7 @@ void Clang::AddSparcTargetArgs(const ArgList &Args,
     CmdArgs.push_back("hard");
   }
 
-  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
+  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
     StringRef Name = A->getValue();
     std::string TuneCPU;
     if (Name == "native")
@@ -2212,11 +2212,12 @@ void Clang::AddX86TargetArgs(const ArgList &Args,
 
   // Default to "generic" unless -march is present or targetting the PS4/PS5.
   std::string TuneCPU;
-  if (!Args.hasArg(options::OPT_march_EQ) && !getToolChain().getTriple().isPS())
+  if (!Args.hasArg(clang::driver::options::OPT_march_EQ) &&
+      !getToolChain().getTriple().isPS())
     TuneCPU = "generic";
 
   // Override based on -mtune.
-  if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) {
+  if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) {
     StringRef Name = A->getValue();
 
     if (Name == "native") {
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index d4f87dec572d9..52e43d1a003eb 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -31,11 +31,11 @@
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Job.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Driver/Util.h"
 #include "clang/Driver/XRayArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
@@ -88,7 +88,8 @@ static bool addRPathCmdArg(const llvm::opt::ArgList &Args,
 
 static bool useFramePointerForTargetByDefault(const llvm::opt::ArgList &Args,
                                               const llvm::Triple &Triple) {
-  if (Args.hasArg(options::OPT_pg) && !Args.hasArg(options::OPT_mfentry))
+  if (Args.hasArg(clang::driver::options::OPT_pg) &&
+      !Args.hasArg(clang::driver::options::OPT_mfentry))
     return true;
 
   if (Triple.isAndroid())
@@ -267,16 +268,17 @@ getFramePointerKind(const llvm::opt::ArgList &Args,
   // without requiring new frame records to be created.
 
   bool DefaultFP = useFramePointerForTargetByDefault(Args, Triple);
-  bool EnableFP = mustUseNonLeafFramePointerForTarget(Triple) ||
-                  Args.hasFlag(options::OPT_fno_omit_frame_pointer,
-                               options::OPT_fomit_frame_pointer, DefaultFP);
+  bool EnableFP =
+      mustUseNonLeafFramePointerForTarget(Triple) ||
+      Args.hasFlag(clang::driver::options::OPT_fno_omit_frame_pointer,
+                   clang::driver::options::OPT_fomit_frame_pointer, DefaultFP);
 
   bool DefaultLeafFP =
       useLeafFramePointerForTargetByDefault(Triple) ||
       (EnableFP && framePointerImpliesLeafFramePointer(Args, Triple));
-  bool EnableLeafFP =
-      Args.hasFlag(options::OPT_mno_omit_leaf_frame_pointer,
-                   options::OPT_momit_leaf_frame_pointer, DefaultLeafFP);
+  bool EnableLeafFP = Args.hasFlag(
+      clang::driver::options::OPT_mno_omit_leaf_frame_pointer,
+      clang::driver::options::OPT_momit_leaf_frame_pointer, DefaultLeafFP);
 
   bool FPRegReserved = EnableFP || mustMaintainValidFrameChain(Args, Triple);
 
@@ -775,7 +777,7 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args,
   case llvm::Triple::ppcle:
   case llvm::Triple::ppc64:
   case llvm::Triple::ppc64le:
-    if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ))
+    if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ))
       return std::string(
           llvm::PPC::getNormalizedPPCTargetCPU(T, A->getValue()));
     return std::string(llvm::PPC::getNormalizedPPCTargetCPU(T));
@@ -1867,7 +1869,7 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
     if (SanArgs.needsFuzzerInterceptors())
       addSanitizerRuntime(TC, Args, CmdArgs, "fuzzer_interceptors", false,
                           true);
-    if (!Args.hasArg(options::OPT_nostdlibxx)) {
+    if (!Args.hasArg(clang::driver::options::OPT_nostdlibxx)) {
       bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) &&
                                  !Args.hasArg(options::OPT_static);
       if (OnlyLibstdcxxStatic)
@@ -3583,7 +3585,7 @@ void tools::handleInterchangeLoopsArgs(const ArgList &Args,
 // Otherwise, return an empty string and issue a diagnosic message if needed.
 StringRef tools::parseMPreferVectorWidthOption(clang::DiagnosticsEngine &Diags,
                                                const llvm::opt::ArgList &Args) {
-  Arg *A = Args.getLastArg(options::OPT_mprefer_vector_width_EQ);
+  Arg *A = Args.getLastArg(clang::driver::options::OPT_mprefer_vector_width_EQ);
   if (!A)
     return "";
 
diff --git a/clang/lib/Driver/ToolChains/CrossWindows.cpp b/clang/lib/Driver/ToolChains/CrossWindows.cpp
index 6df5315e8fff8..51c892fc91718 100644
--- a/clang/lib/Driver/ToolChains/CrossWindows.cpp
+++ b/clang/lib/Driver/ToolChains/CrossWindows.cpp
@@ -10,8 +10,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index de3cf7abbdec5..8fc1bded2acea 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -15,7 +15,7 @@
 #include "clang/Driver/Distro.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE
 #include "llvm/Option/ArgList.h"
@@ -154,16 +154,16 @@ CudaInstallationDetector::CudaInstallationDetector(
   std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"};
   auto &FS = D.getVFS();
 
-  if (Args.hasArg(options::OPT_cuda_path_EQ)) {
+  if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) {
     Candidates.emplace_back(
-        Args.getLastArgValue(options::OPT_cuda_path_EQ).str());
+        Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str());
   } else if (HostTriple.isOSWindows()) {
     for (const char *Ver : Versions)
       Candidates.emplace_back(
           D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" +
           Ver);
   } else {
-    if (!Args.hasArg(options::OPT_cuda_path_ignore_env)) {
+    if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) {
       // Try to find ptxas binary. If the executable is located in a directory
       // called 'bin/', its parent directory might be a good guess for a valid
       // CUDA installation.
diff --git a/clang/lib/Driver/ToolChains/Cygwin.cpp b/clang/lib/Driver/ToolChains/Cygwin.cpp
index 55438125ce0f1..d9c16347daa34 100644
--- a/clang/lib/Driver/ToolChains/Cygwin.cpp
+++ b/clang/lib/Driver/ToolChains/Cygwin.cpp
@@ -10,7 +10,7 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
 
@@ -58,7 +58,7 @@ void Cygwin::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   const Driver &D = getDriver();
   std::string SysRoot = computeSysRoot();
 
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index fc3cd9030f71d..2fb7652d64536 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -14,8 +14,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/ProfileData/InstrProf.h"
@@ -1079,7 +1079,7 @@ StringRef MachO::getMachOArchName(const ArgList &Args) const {
 
   case llvm::Triple::thumb:
   case llvm::Triple::arm:
-    if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
+    if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ))
       if (const char *Arch = ArmMachOArchName(A->getValue()))
         return Arch;
 
@@ -2993,7 +2993,7 @@ DerivedArgList *MachO::TranslateArgs(const DerivedArgList &Args,
   if (!BoundArch.empty()) {
     StringRef Name = BoundArch;
     const Option MCpu = Opts.getOption(options::OPT_mcpu_EQ);
-    const Option MArch = Opts.getOption(options::OPT_march_EQ);
+    const Option MArch = Opts.getOption(clang::driver::options::OPT_march_EQ);
 
     // This code must be kept in sync with LLVM's getArchTypeForDarwinArch,
     // which defines the list of which architectures we accept.
diff --git a/clang/lib/Driver/ToolChains/DragonFly.cpp b/clang/lib/Driver/ToolChains/DragonFly.cpp
index d4a6d6ae3e349..524f5f2ff391e 100644
--- a/clang/lib/Driver/ToolChains/DragonFly.cpp
+++ b/clang/lib/Driver/ToolChains/DragonFly.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 
@@ -219,7 +219,7 @@ void DragonFly::AddClangSystemIncludeArgs(
     llvm::opt::ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index c6038ac85514c..f24dd3982eab7 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -11,7 +11,7 @@
 
 #include "clang/Basic/CodeGenOptions.h"
 #include "clang/Driver/CommonArgs.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Frontend/Debug/Options.h"
 #include "llvm/Support/Path.h"
 #include "llvm/TargetParser/Host.h"
@@ -236,8 +236,9 @@ void Flang::addCodegenOptions(const ArgList &Args,
        options::OPT_frepack_arrays_contiguity_EQ,
        options::OPT_fstack_repack_arrays, options::OPT_fno_stack_repack_arrays,
        options::OPT_ftime_report, options::OPT_ftime_report_EQ,
-       options::OPT_funroll_loops, options::OPT_fno_unroll_loops});
-  if (Args.hasArg(options::OPT_fcoarray))
+       options::OPT_funroll_loops, options::OPT_fno_unroll_loops,
+       options::OPT_fdefer_desc_map, options::OPT_fno_defer_desc_map});
+  if (Args.hasArg(clang::driver::options::OPT_fcoarray))
     CmdArgs.push_back("-fcoarray");
 }
 
diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp
index 70e66a2f5c3e7..b17b76233ad30 100644
--- a/clang/lib/Driver/ToolChains/FreeBSD.cpp
+++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp
@@ -13,8 +13,8 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/VirtualFileSystem.h"
 
@@ -404,7 +404,7 @@ void FreeBSD::AddClangSystemIncludeArgs(
     llvm::opt::ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp
index 9edfc4de3d602..507cc03b27513 100644
--- a/clang/lib/Driver/ToolChains/Fuchsia.cpp
+++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp
@@ -12,8 +12,8 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/MultilibBuilder.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -344,7 +344,7 @@ Tool *Fuchsia::buildStaticLibTool() const {
 
 ToolChain::RuntimeLibType
 Fuchsia::GetRuntimeLibType(const ArgList &Args) const {
-  if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) {
+  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) {
     StringRef Value = A->getValue();
     if (Value != "compiler-rt")
       getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name)
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index d56cb02e14ca9..b7cd5f6be14a6 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -20,9 +20,9 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/MultilibBuilder.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Option/ArgList.h"
@@ -2104,7 +2104,7 @@ Generic_GCC::GCCVersion Generic_GCC::GCCVersion::Parse(StringRef VersionText) {
 
 static llvm::StringRef getGCCToolchainDir(const ArgList &Args,
                                           llvm::StringRef SysRoot) {
-  const Arg *A = Args.getLastArg(options::OPT_gcc_toolchain);
+  const Arg *A = Args.getLastArg(clang::driver::options::OPT_gcc_toolchain);
   if (A)
     return A->getValue();
 
@@ -2157,7 +2157,8 @@ void Generic_GCC::GCCInstallationDetector::init(
                            CandidateBiarchTripleAliases);
 
   // If --gcc-install-dir= is specified, skip filesystem detection.
-  if (const Arg *A = Args.getLastArg(options::OPT_gcc_install_dir_EQ);
+  if (const Arg *A =
+          Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ);
       A && A->getValue()[0]) {
     StringRef InstallDir = A->getValue();
     if (!ScanGCCForMultilibs(TargetTriple, Args, InstallDir, false)) {
@@ -2180,7 +2181,8 @@ void Generic_GCC::GCCInstallationDetector::init(
 
   // If --gcc-triple is specified use this instead of trying to
   // auto-detect a triple.
-  if (const Arg *A = Args.getLastArg(options::OPT_gcc_triple_EQ)) {
+  if (const Arg *A =
+          Args.getLastArg(clang::driver::options::OPT_gcc_triple_EQ)) {
     StringRef GCCTriple = A->getValue();
     CandidateTripleAliases.clear();
     CandidateTripleAliases.push_back(GCCTriple);
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index b1c30beae1d35..d0d2d2e34b602 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -15,8 +15,8 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/TargetParser/TargetParser.h"
diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp
index 44265e8a8b95f..fb738577c4c44 100644
--- a/clang/lib/Driver/ToolChains/HIPSPV.cpp
+++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp
@@ -12,7 +12,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 
@@ -213,7 +213,7 @@ HIPSPVToolChain::getDeviceLibs(
   // Find device libraries in --hip-device-lib-path and HIP_DEVICE_LIB_PATH.
   auto HipDeviceLibPathArgs = DriverArgs.getAllArgValues(
       // --hip-device-lib-path is alias to this option.
-      options::OPT_rocm_device_lib_path_EQ);
+      clang::driver::options::OPT_rocm_device_lib_path_EQ);
   for (auto Path : HipDeviceLibPathArgs)
     LibraryPaths.push_back(DriverArgs.MakeArgString(Path));
 
diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp
index 1af2ae6470f1e..732403e69a075 100644
--- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
+++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
@@ -9,7 +9,7 @@
 #include "HIPUtility.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Object/Archive.h"
diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp
index 084f51721315c..9f8b676fc7dc2 100644
--- a/clang/lib/Driver/ToolChains/Hexagon.cpp
+++ b/clang/lib/Driver/ToolChains/Hexagon.cpp
@@ -11,7 +11,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
diff --git a/clang/lib/Driver/ToolChains/Hurd.cpp b/clang/lib/Driver/ToolChains/Hurd.cpp
index 53ee4d4c0cbde..43121233ea7d0 100644
--- a/clang/lib/Driver/ToolChains/Hurd.cpp
+++ b/clang/lib/Driver/ToolChains/Hurd.cpp
@@ -10,7 +10,7 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
 
@@ -168,7 +168,7 @@ void Hurd::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   const Driver &D = getDriver();
   std::string SysRoot = computeSysRoot();
 
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 0f2ceff339eff..f452109134171 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -16,8 +16,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Distro.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/Path.h"
@@ -731,7 +731,7 @@ void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   const Driver &D = getDriver();
   std::string SysRoot = computeSysRoot();
 
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   if (DriverArgs.hasArg(clang::driver::options::OPT_fopenmp)) {
diff --git a/clang/lib/Driver/ToolChains/MSP430.cpp b/clang/lib/Driver/ToolChains/MSP430.cpp
index 3cc56bb7e832e..9eca1ad5f2865 100644
--- a/clang/lib/Driver/ToolChains/MSP430.cpp
+++ b/clang/lib/Driver/ToolChains/MSP430.cpp
@@ -12,7 +12,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/InputInfo.h"
 #include "clang/Driver/Multilib.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 
diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
index e9972cb24c245..0d4bbabb9bb8a 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -12,8 +12,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/ConvertUTF.h"
diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp
index 1bbabdfc631b8..da4a9072317f4 100644
--- a/clang/lib/Driver/ToolChains/Managarm.cpp
+++ b/clang/lib/Driver/ToolChains/Managarm.cpp
@@ -11,8 +11,8 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 
@@ -136,7 +136,7 @@ void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   const Driver &D = getDriver();
   std::string SysRoot = computeSysRoot();
 
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp
index 0e8eeeb0523ce..bd0e40ae3d7ad 100644
--- a/clang/lib/Driver/ToolChains/MinGW.cpp
+++ b/clang/lib/Driver/ToolChains/MinGW.cpp
@@ -12,8 +12,8 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
diff --git a/clang/lib/Driver/ToolChains/MipsLinux.cpp b/clang/lib/Driver/ToolChains/MipsLinux.cpp
index 58d6b5031f536..7dd3936613296 100644
--- a/clang/lib/Driver/ToolChains/MipsLinux.cpp
+++ b/clang/lib/Driver/ToolChains/MipsLinux.cpp
@@ -9,7 +9,7 @@
 #include "MipsLinux.h"
 #include "Arch/Mips.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -38,7 +38,7 @@ MipsLLVMToolChain::MipsLLVMToolChain(const Driver &D,
 
 void MipsLLVMToolChain::AddClangSystemIncludeArgs(
     const ArgList &DriverArgs, ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   const Driver &D = getDriver();
diff --git a/clang/lib/Driver/ToolChains/NetBSD.cpp b/clang/lib/Driver/ToolChains/NetBSD.cpp
index ea722b59853d6..8db00deeb80df 100644
--- a/clang/lib/Driver/ToolChains/NetBSD.cpp
+++ b/clang/lib/Driver/ToolChains/NetBSD.cpp
@@ -14,8 +14,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/VirtualFileSystem.h"
 
@@ -466,7 +466,7 @@ void NetBSD::AddClangSystemIncludeArgs(
     llvm::opt::ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp
index 607eb714f85dc..00991504e97a8 100644
--- a/clang/lib/Driver/ToolChains/OHOS.cpp
+++ b/clang/lib/Driver/ToolChains/OHOS.cpp
@@ -12,8 +12,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/FileSystem.h"
@@ -174,7 +174,7 @@ OHOS::OHOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
 
 ToolChain::RuntimeLibType OHOS::GetRuntimeLibType(
     const ArgList &Args) const {
-  if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) {
+  if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) {
     StringRef Value = A->getValue();
     if (Value != "compiler-rt")
       getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name)
diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp
index 5e7b4f1a664e6..8f589186af343 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.cpp
+++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp
@@ -13,8 +13,8 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
@@ -315,7 +315,7 @@ void OpenBSD::AddClangSystemIncludeArgs(
     llvm::opt::ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
diff --git a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
index 76180431ee682..8d381c4f14371 100644
--- a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
+++ b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp
@@ -8,7 +8,7 @@
 
 #include "PPCFreeBSD.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Support/Path.h"
 
 using namespace clang::driver::toolchains;
@@ -16,7 +16,7 @@ using namespace llvm::opt;
 
 void PPCFreeBSDToolChain::AddClangSystemIncludeArgs(
     const ArgList &DriverArgs, ArgStringList &CC1Args) const {
-  if (!DriverArgs.hasArg(options::OPT_nostdinc) &&
+  if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) &&
       !DriverArgs.hasArg(options::OPT_nobuiltininc)) {
     const Driver &D = getDriver();
     SmallString<128> P(D.ResourceDir);
diff --git a/clang/lib/Driver/ToolChains/PPCLinux.cpp b/clang/lib/Driver/ToolChains/PPCLinux.cpp
index 672ebd5b7b98d..768214e416bd7 100644
--- a/clang/lib/Driver/ToolChains/PPCLinux.cpp
+++ b/clang/lib/Driver/ToolChains/PPCLinux.cpp
@@ -8,7 +8,7 @@
 
 #include "PPCLinux.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 
@@ -58,7 +58,7 @@ PPCLinuxToolChain::PPCLinuxToolChain(const Driver &D,
 
 void PPCLinuxToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                   ArgStringList &CC1Args) const {
-  if (!DriverArgs.hasArg(options::OPT_nostdinc) &&
+  if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) &&
       !DriverArgs.hasArg(options::OPT_nobuiltininc)) {
     const Driver &D = getDriver();
     SmallString<128> P(D.ResourceDir);
diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
index 6fe18aa4cceba..34ec65ae59602 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
+++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
@@ -11,8 +11,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
diff --git a/clang/lib/Driver/ToolChains/SPIRV.cpp b/clang/lib/Driver/ToolChains/SPIRV.cpp
index 22aad7e3f49d7..afb9e63b4b348 100644
--- a/clang/lib/Driver/ToolChains/SPIRV.cpp
+++ b/clang/lib/Driver/ToolChains/SPIRV.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/InputInfo.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 
 using namespace clang::driver;
 using namespace clang::driver::toolchains;
diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp
index 85859f344b491..0232b047a6c4b 100644
--- a/clang/lib/Driver/ToolChains/SYCL.cpp
+++ b/clang/lib/Driver/ToolChains/SYCL.cpp
@@ -20,7 +20,7 @@ SYCLInstallationDetector::SYCLInstallationDetector(
 
 void SYCLInstallationDetector::addSYCLIncludeArgs(
     const ArgList &DriverArgs, ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(options::OPT_nobuiltininc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nobuiltininc))
     return;
 
   // Add the SYCL header search locations in the specified order.
diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp
index ad0f41144f393..64c7d1ceb3a36 100644
--- a/clang/lib/Driver/ToolChains/Solaris.cpp
+++ b/clang/lib/Driver/ToolChains/Solaris.cpp
@@ -13,9 +13,9 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/ToolChain.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
@@ -360,7 +360,7 @@ void Solaris::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                         ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
diff --git a/clang/lib/Driver/ToolChains/UEFI.cpp b/clang/lib/Driver/ToolChains/UEFI.cpp
index 7732e37f8061d..d2be147c7b9f6 100644
--- a/clang/lib/Driver/ToolChains/UEFI.cpp
+++ b/clang/lib/Driver/ToolChains/UEFI.cpp
@@ -11,8 +11,8 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/VirtualFileSystem.h"
diff --git a/clang/lib/Driver/ToolChains/VEToolchain.cpp b/clang/lib/Driver/ToolChains/VEToolchain.cpp
index 78509bcdae0fe..ad9129046c3e1 100644
--- a/clang/lib/Driver/ToolChains/VEToolchain.cpp
+++ b/clang/lib/Driver/ToolChains/VEToolchain.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/Path.h"
 #include <cstdlib> // ::getenv
@@ -78,7 +78,7 @@ bool VEToolChain::hasBlocksRuntime() const { return false; }
 
 void VEToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                             ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   if (DriverArgs.hasArg(options::OPT_nobuiltininc) &&
@@ -117,7 +117,7 @@ void VEToolChain::addClangTargetOptions(const ArgList &DriverArgs,
 
 void VEToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
                                                ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
       DriverArgs.hasArg(options::OPT_nostdlibinc) ||
       DriverArgs.hasArg(options::OPT_nostdincxx))
     return;
diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp
index 15c6f19e87fee..5054868b5ff4d 100644
--- a/clang/lib/Driver/ToolChains/WebAssembly.cpp
+++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp
@@ -12,7 +12,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_VERSION_STRING
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
@@ -297,7 +297,7 @@ bool WebAssembly::HasNativeLLVMSupport() const { return true; }
 void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs,
                                         ArgStringList &CC1Args,
                                         Action::OffloadKind) const {
-  if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
+  if (!DriverArgs.hasFlag(clang::driver::options::OPT_fuse_init_array,
                           options::OPT_fno_use_init_array, true))
     CC1Args.push_back("-fno-use-init-array");
 
@@ -472,7 +472,7 @@ WebAssembly::GetCXXStdlibType(const ArgList &Args) const {
 
 void WebAssembly::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                             ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(options::OPT_nostdinc))
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
     return;
 
   const Driver &D = getDriver();
diff --git a/clang/lib/Driver/ToolChains/XCore.cpp b/clang/lib/Driver/ToolChains/XCore.cpp
index dd26c11affffb..6a2a75cb99739 100644
--- a/clang/lib/Driver/ToolChains/XCore.cpp
+++ b/clang/lib/Driver/ToolChains/XCore.cpp
@@ -10,7 +10,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include <cstdlib> // ::getenv
 
@@ -113,7 +113,7 @@ bool XCoreToolChain::hasBlocksRuntime() const { return false; }
 
 void XCoreToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
                                                ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
       DriverArgs.hasArg(options::OPT_nostdlibinc))
     return;
   if (const char *cl_include_dir = getenv("XCC_C_INCLUDE_PATH")) {
@@ -137,7 +137,7 @@ void XCoreToolChain::addClangTargetOptions(const ArgList &DriverArgs,
 
 void XCoreToolChain::AddClangCXXStdlibIncludeArgs(
     const ArgList &DriverArgs, ArgStringList &CC1Args) const {
-  if (DriverArgs.hasArg(options::OPT_nostdinc) ||
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) ||
       DriverArgs.hasArg(options::OPT_nostdlibinc) ||
       DriverArgs.hasArg(options::OPT_nostdincxx))
     return;
diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp
index eac8f623f9a50..9a3c45323a3cf 100644
--- a/clang/lib/Driver/ToolChains/ZOS.cpp
+++ b/clang/lib/Driver/ToolChains/ZOS.cpp
@@ -9,7 +9,7 @@
 #include "ZOS.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Compilation.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/WithColor.h"
diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp
index 4c2d11751a363..0325296f84b19 100644
--- a/clang/lib/Driver/XRayArgs.cpp
+++ b/clang/lib/Driver/XRayArgs.cpp
@@ -8,8 +8,8 @@
 #include "clang/Driver/XRayArgs.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/ToolChain.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/SpecialCaseList.h"
diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt
index dac9e0d26f393..a916667208845 100644
--- a/clang/lib/Frontend/CMakeLists.txt
+++ b/clang/lib/Frontend/CMakeLists.txt
@@ -52,7 +52,6 @@ add_clang_library(clangFrontend
   clangAST
   clangBasic
   clangDriver
-  clangOptions
   clangEdit
   clangLex
   clangParse
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 0782dc1b585c3..66759d363d531 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -27,6 +27,7 @@
 #include "clang/Basic/XRayInstr.h"
 #include "clang/Config/config.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Frontend/CommandLineSourceLoc.h"
 #include "clang/Frontend/DependencyOutputOptions.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
@@ -37,7 +38,6 @@
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/HeaderSearchOptions.h"
 #include "clang/Lex/PreprocessorOptions.h"
-#include "clang/Options/Options.h"
 #include "clang/Serialization/ASTBitCodes.h"
 #include "clang/Serialization/ModuleFileExtension.h"
 #include "clang/StaticAnalyzer/Core/AnalyzerOptions.h"
@@ -255,7 +255,7 @@ CowCompilerInvocation::getMutPreprocessorOutputOpts() {
 using ArgumentConsumer = CompilerInvocation::ArgumentConsumer;
 
 #define OPTTABLE_STR_TABLE_CODE
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef OPTTABLE_STR_TABLE_CODE
 
 static llvm::StringRef lookupStrInTable(unsigned Offset) {
@@ -263,7 +263,7 @@ static llvm::StringRef lookupStrInTable(unsigned Offset) {
 }
 
 #define SIMPLE_ENUM_VALUE_TABLE
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef SIMPLE_ENUM_VALUE_TABLE
 
 static std::optional<bool> normalizeSimpleFlag(OptSpecifier Opt,
@@ -989,7 +989,7 @@ static void GenerateAnalyzerArgs(const AnalyzerOptions &Opts,
 
 #define ANALYZER_OPTION_WITH_MARSHALLING(...)                                  \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef ANALYZER_OPTION_WITH_MARSHALLING
 
   if (Opts.AnalysisConstraintsOpt != RangeConstraintsModel) {
@@ -1076,7 +1076,7 @@ static bool ParseAnalyzerArgs(AnalyzerOptions &Opts, ArgList &Args,
 
 #define ANALYZER_OPTION_WITH_MARSHALLING(...)                                  \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef ANALYZER_OPTION_WITH_MARSHALLING
 
   if (Arg *A = Args.getLastArg(OPT_analyzer_constraints)) {
@@ -1583,7 +1583,7 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts,
 
 #define CODEGEN_OPTION_WITH_MARSHALLING(...)                                   \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef CODEGEN_OPTION_WITH_MARSHALLING
 
   if (Opts.OptimizationLevel > 0) {
@@ -1888,7 +1888,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
 
 #define CODEGEN_OPTION_WITH_MARSHALLING(...)                                   \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef CODEGEN_OPTION_WITH_MARSHALLING
 
   // At O0 we want to fully disable inlining outside of cases marked with
@@ -2379,7 +2379,7 @@ static void GenerateDependencyOutputArgs(const DependencyOutputOptions &Opts,
   const DependencyOutputOptions &DependencyOutputOpts = Opts;
 #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...)                         \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING
 
   if (Opts.ShowIncludesDest != ShowIncludesDestination::None)
@@ -2414,7 +2414,7 @@ static bool ParseDependencyOutputArgs(DependencyOutputOptions &Opts,
   DependencyOutputOptions &DependencyOutputOpts = Opts;
 #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...)                         \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING
 
   if (Args.hasArg(OPT_show_includes)) {
@@ -2542,7 +2542,7 @@ static void GenerateFileSystemArgs(const FileSystemOptions &Opts,
 
 #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...)                               \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING
 }
 
@@ -2554,7 +2554,7 @@ static bool ParseFileSystemArgs(FileSystemOptions &Opts, const ArgList &Args,
 
 #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...)                               \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING
 
   return Diags.getNumErrors() == NumErrorsBefore;
@@ -2565,7 +2565,7 @@ static void GenerateMigratorArgs(const MigratorOptions &Opts,
   const MigratorOptions &MigratorOpts = Opts;
 #define MIGRATOR_OPTION_WITH_MARSHALLING(...)                                  \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef MIGRATOR_OPTION_WITH_MARSHALLING
 }
 
@@ -2577,7 +2577,7 @@ static bool ParseMigratorArgs(MigratorOptions &Opts, const ArgList &Args,
 
 #define MIGRATOR_OPTION_WITH_MARSHALLING(...)                                  \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef MIGRATOR_OPTION_WITH_MARSHALLING
 
   return Diags.getNumErrors() == NumErrorsBefore;
@@ -2589,7 +2589,7 @@ void CompilerInvocationBase::GenerateDiagnosticArgs(
   const DiagnosticOptions *DiagnosticOpts = &Opts;
 #define DIAG_OPTION_WITH_MARSHALLING(...)                                      \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef DIAG_OPTION_WITH_MARSHALLING
 
   if (!Opts.DiagnosticSerializationFile.empty())
@@ -2694,7 +2694,7 @@ bool clang::ParseDiagnosticArgs(DiagnosticOptions &Opts, ArgList &Args,
 
 #define DIAG_OPTION_WITH_MARSHALLING(...)                                      \
   PARSE_OPTION_WITH_MARSHALLING(Args, *Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef DIAG_OPTION_WITH_MARSHALLING
 
   llvm::sys::Process::UseANSIEscapeCodes(Opts.UseANSIEscapeCodes);
@@ -2844,7 +2844,7 @@ static void GenerateFrontendArgs(const FrontendOptions &Opts,
   const FrontendOptions &FrontendOpts = Opts;
 #define FRONTEND_OPTION_WITH_MARSHALLING(...)                                  \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef FRONTEND_OPTION_WITH_MARSHALLING
 
   std::optional<OptSpecifier> ProgramActionOpt =
@@ -3014,7 +3014,7 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
 
 #define FRONTEND_OPTION_WITH_MARSHALLING(...)                                  \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef FRONTEND_OPTION_WITH_MARSHALLING
 
   Opts.ProgramAction = frontend::ParseSyntaxOnly;
@@ -3296,7 +3296,7 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts,
   const HeaderSearchOptions *HeaderSearchOpts = &Opts;
 #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...)                             \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING
 
   if (Opts.UseLibcxx)
@@ -3411,7 +3411,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args,
 
 #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...)                             \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING
 
   if (const Arg *A = Args.getLastArg(OPT_stdlib_EQ))
@@ -3744,7 +3744,7 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts,
 
 #define LANG_OPTION_WITH_MARSHALLING(...)                                      \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef LANG_OPTION_WITH_MARSHALLING
 
   // The '-fcf-protection=' option is generated by CodeGenOpts generator.
@@ -4153,7 +4153,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
 
 #define LANG_OPTION_WITH_MARSHALLING(...)                                      \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef LANG_OPTION_WITH_MARSHALLING
 
   if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) {
@@ -4874,7 +4874,7 @@ static void GeneratePreprocessorArgs(const PreprocessorOptions &Opts,
 
 #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...)                              \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef PREPROCESSOR_OPTION_WITH_MARSHALLING
 
   if (Opts.PCHWithHdrStop && !Opts.PCHWithHdrStopCreate)
@@ -4948,7 +4948,7 @@ static bool ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args,
 
 #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...)                              \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef PREPROCESSOR_OPTION_WITH_MARSHALLING
 
   Opts.PCHWithHdrStop = Args.hasArg(OPT_pch_through_hdrstop_create) ||
@@ -5041,7 +5041,7 @@ GeneratePreprocessorOutputArgs(const PreprocessorOutputOptions &Opts,
 
 #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...)                       \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING
 
   bool Generate_dM = isStrictlyPreprocessorAction(Action) && !Opts.ShowCPP;
@@ -5062,7 +5062,7 @@ static bool ParsePreprocessorOutputArgs(PreprocessorOutputOptions &Opts,
 
 #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...)                       \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING
 
   Opts.ShowCPP = isStrictlyPreprocessorAction(Action) && !Args.hasArg(OPT_dM);
@@ -5077,7 +5077,7 @@ static void GenerateTargetArgs(const TargetOptions &Opts,
   const TargetOptions *TargetOpts = &Opts;
 #define TARGET_OPTION_WITH_MARSHALLING(...)                                    \
   GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef TARGET_OPTION_WITH_MARSHALLING
 
   if (!Opts.SDKVersion.empty())
@@ -5096,7 +5096,7 @@ static bool ParseTargetArgs(TargetOptions &Opts, ArgList &Args,
 
 #define TARGET_OPTION_WITH_MARSHALLING(...)                                    \
   PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__)
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef TARGET_OPTION_WITH_MARSHALLING
 
   if (Arg *A = Args.getLastArg(options::OPT_target_sdk_version_EQ)) {
diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
index 73b81ed906808..99212b81fe064 100644
--- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
+++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
@@ -14,11 +14,11 @@
 #include "clang/Driver/Action.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/Utils.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Option/ArgList.h"
@@ -61,11 +61,11 @@ clang::createInvocation(ArrayRef<const char *> ArgList,
   if (!C)
     return nullptr;
 
-  if (C->getArgs().hasArg(options::OPT_fdriver_only))
+  if (C->getArgs().hasArg(driver::options::OPT_fdriver_only))
     return nullptr;
 
   // Just print the cc1 options if -### was present.
-  if (C->getArgs().hasArg(options::OPT__HASH_HASH_HASH)) {
+  if (C->getArgs().hasArg(driver::options::OPT__HASH_HASH_HASH)) {
     C->getJobs().Print(llvm::errs(), "\n", true);
     return nullptr;
   }
diff --git a/clang/lib/FrontendTool/CMakeLists.txt b/clang/lib/FrontendTool/CMakeLists.txt
index 66213f76eb968..061e54c3e62d0 100644
--- a/clang/lib/FrontendTool/CMakeLists.txt
+++ b/clang/lib/FrontendTool/CMakeLists.txt
@@ -7,7 +7,6 @@ set(link_libs
   clangBasic
   clangCodeGen
   clangDriver
-  clangOptions
   clangExtractAPI
   clangFrontend
   clangRewriteFrontend
diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
index e571193c6a9c8..c8aad4daa1c10 100644
--- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
+++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
@@ -13,6 +13,7 @@
 
 #include "clang/CodeGen/CodeGenAction.h"
 #include "clang/Config/config.h"
+#include "clang/Driver/Options.h"
 #include "clang/ExtractAPI/FrontendActions.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
@@ -21,7 +22,6 @@
 #include "clang/Frontend/FrontendPluginRegistry.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/FrontendTool/Utils.h"
-#include "clang/Options/Options.h"
 #include "clang/Rewrite/Frontend/FrontendActions.h"
 #include "clang/StaticAnalyzer/Frontend/AnalyzerHelpFlags.h"
 #include "clang/StaticAnalyzer/Frontend/FrontendActions.h"
@@ -215,11 +215,11 @@ bool ExecuteCompilerInvocation(CompilerInstance *Clang) {
 
   // Honor -help.
   if (Clang->getFrontendOpts().ShowHelp) {
-    getDriverOptTable().printHelp(
+    driver::getDriverOptTable().printHelp(
         llvm::outs(), "clang -cc1 [options] file...",
         "LLVM 'Clang' Compiler: http://clang.llvm.org",
         /*ShowHidden=*/false, /*ShowAllAliases=*/false,
-        llvm::opt::Visibility(options::CC1Option));
+        llvm::opt::Visibility(driver::options::CC1Option));
     return true;
   }
 
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index 7764fa7dc92b9..76338065d2231 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -33,6 +33,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Job.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
@@ -42,7 +43,6 @@
 #include "clang/Interpreter/Interpreter.h"
 #include "clang/Interpreter/Value.h"
 #include "clang/Lex/PreprocessorOptions.h"
-#include "clang/Options/Options.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Serialization/ObjectFilePCHContainerReader.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
@@ -185,7 +185,7 @@ IncrementalCompilerBuilder::create(std::string TT,
   llvm::ArrayRef<const char *> RF = llvm::ArrayRef(ClangArgv);
   std::unique_ptr<driver::Compilation> Compilation(Driver.BuildCompilation(RF));
 
-  if (Compilation->getArgs().hasArg(options::OPT_v))
+  if (Compilation->getArgs().hasArg(driver::options::OPT_v))
     Compilation->getJobs().Print(llvm::errs(), "\n", /*Quote=*/false);
 
   auto ErrOrCC1Args = GetCC1Arguments(&Diags, Compilation.get());
diff --git a/clang/lib/Interpreter/InterpreterUtils.h b/clang/lib/Interpreter/InterpreterUtils.h
index 4efe8b9fbc6cc..fbf9814b0d4a7 100644
--- a/clang/lib/Interpreter/InterpreterUtils.h
+++ b/clang/lib/Interpreter/InterpreterUtils.h
@@ -21,11 +21,11 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Job.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/TextDiagnosticBuffer.h"
 #include "clang/Lex/PreprocessorOptions.h"
-#include "clang/Options/Options.h"
 
 #include "clang/Sema/Lookup.h"
 #include "llvm/IR/Module.h"
diff --git a/clang/lib/Options/CMakeLists.txt b/clang/lib/Options/CMakeLists.txt
deleted file mode 100644
index a762e9918b41c..0000000000000
--- a/clang/lib/Options/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-set(LLVM_LINK_COMPONENTS
-  Option
-  Support
-)
-
-add_clang_library(clangOptions
-  DriverOptions.cpp
-  OptionUtils.cpp
-  
-  DEPENDS
-  ClangDriverOptions
-  # These generated headers are included transitively.
-  target_parser_gen
-
-  LINK_LIBS
-  clangBasic
-  ${system_libs}
-)
diff --git a/clang/lib/Tooling/CMakeLists.txt b/clang/lib/Tooling/CMakeLists.txt
index faaa53276d0e6..fc1f1f9f9d367 100644
--- a/clang/lib/Tooling/CMakeLists.txt
+++ b/clang/lib/Tooling/CMakeLists.txt
@@ -40,7 +40,6 @@ add_clang_library(clangTooling
   clangASTMatchers
   clangBasic
   clangDriver
-  clangOptions
   clangFormat
   clangFrontend
   clangLex
diff --git a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
index e9b72388ae4df..28568426a6c48 100644
--- a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
+++ b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp
@@ -44,8 +44,8 @@
 
 #include "clang/Basic/LangStandard.h"
 #include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/Types.h"
-#include "clang/Options/Options.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -164,11 +164,11 @@ struct TransferableCommand {
     // We parse each argument individually so that we can retain the exact
     // spelling of each argument; re-rendering is lossy for aliased flags.
     // E.g. in CL mode, /W4 maps to -Wall.
-    auto &OptTable = getDriverOptTable();
+    auto &OptTable = clang::driver::getDriverOptTable();
     if (!OldArgs.empty())
       Cmd.CommandLine.emplace_back(OldArgs.front());
     for (unsigned Pos = 1; Pos < OldArgs.size();) {
-      using namespace options;
+      using namespace driver::options;
 
       const unsigned OldPos = Pos;
       std::unique_ptr<llvm::opt::Arg> Arg(OptTable.ParseOneArg(
@@ -296,14 +296,14 @@ struct TransferableCommand {
   // Try to interpret the argument as a type specifier, e.g. '-x'.
   std::optional<types::ID> tryParseTypeArg(const llvm::opt::Arg &Arg) {
     const llvm::opt::Option &Opt = Arg.getOption();
-    using namespace options;
+    using namespace driver::options;
     if (ClangCLMode) {
       if (Opt.matches(OPT__SLASH_TC) || Opt.matches(OPT__SLASH_Tc))
         return types::TY_C;
       if (Opt.matches(OPT__SLASH_TP) || Opt.matches(OPT__SLASH_Tp))
         return types::TY_CXX;
     } else {
-      if (Opt.matches(options::OPT_x))
+      if (Opt.matches(driver::options::OPT_x))
         return types::lookupTypeForTypeSpecifier(Arg.getValue());
     }
     return std::nullopt;
@@ -311,7 +311,7 @@ struct TransferableCommand {
 
   // Try to interpret the argument as '-std='.
   std::optional<LangStandard::Kind> tryParseStdArg(const llvm::opt::Arg &Arg) {
-    using namespace options;
+    using namespace driver::options;
     if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ)) {
       // "c++latest" is not a recognized LangStandard, but it's accepted by
       // the clang driver in CL mode.
diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp
index 1f6a5c94601fc..e8eef5ed9c9fa 100644
--- a/clang/lib/Tooling/Tooling.cpp
+++ b/clang/lib/Tooling/Tooling.cpp
@@ -21,6 +21,7 @@
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Job.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Frontend/ASTUnit.h"
@@ -31,7 +32,6 @@
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Lex/HeaderSearchOptions.h"
 #include "clang/Lex/PreprocessorOptions.h"
-#include "clang/Options/Options.h"
 #include "clang/Tooling/ArgumentsAdjusters.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -270,15 +270,17 @@ void addTargetAndModeForProgramName(std::vector<std::string> &CommandLine,
                                     StringRef InvokedAs) {
   if (CommandLine.empty() || InvokedAs.empty())
     return;
-  const auto &Table = getDriverOptTable();
+  const auto &Table = driver::getDriverOptTable();
   // --target=X
-  StringRef TargetOPT = Table.getOption(options::OPT_target).getPrefixedName();
+  StringRef TargetOPT =
+      Table.getOption(driver::options::OPT_target).getPrefixedName();
   // -target X
   StringRef TargetOPTLegacy =
-      Table.getOption(options::OPT_target_legacy_spelling).getPrefixedName();
+      Table.getOption(driver::options::OPT_target_legacy_spelling)
+          .getPrefixedName();
   // --driver-mode=X
   StringRef DriverModeOPT =
-      Table.getOption(options::OPT_driver_mode).getPrefixedName();
+      Table.getOption(driver::options::OPT_driver_mode).getPrefixedName();
   auto TargetMode =
       driver::ToolChain::getTargetAndModeFromProgramName(InvokedAs);
   // No need to search for target args if we don't have a target/mode to insert.
diff --git a/clang/tools/clang-check/CMakeLists.txt b/clang/tools/clang-check/CMakeLists.txt
index 1efcc91fcaec9..5493aa4237aee 100644
--- a/clang/tools/clang-check/CMakeLists.txt
+++ b/clang/tools/clang-check/CMakeLists.txt
@@ -14,7 +14,6 @@ clang_target_link_libraries(clang-check
   clangBasic
   clangDriver
   clangFrontend
-  clangOptions
   clangRewriteFrontend
   clangSerialization
   clangStaticAnalyzerFrontend
diff --git a/clang/tools/clang-check/ClangCheck.cpp b/clang/tools/clang-check/ClangCheck.cpp
index 80255c647b98f..fa6dd06a1ee58 100644
--- a/clang/tools/clang-check/ClangCheck.cpp
+++ b/clang/tools/clang-check/ClangCheck.cpp
@@ -16,9 +16,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/AST/ASTConsumer.h"
+#include "clang/Driver/Options.h"
 #include "clang/Frontend/ASTConsumers.h"
 #include "clang/Frontend/CompilerInstance.h"
-#include "clang/Options/Options.h"
 #include "clang/Rewrite/Frontend/FixItRewriter.h"
 #include "clang/Rewrite/Frontend/FrontendActions.h"
 #include "clang/StaticAnalyzer/Frontend/FrontendActions.h"
@@ -34,8 +34,8 @@
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/TargetSelect.h"
 
+using namespace clang::driver;
 using namespace clang::tooling;
-using namespace clang;
 using namespace llvm;
 
 static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage);
diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt
index 54bc80486472f..9c0d9dff7dc7f 100644
--- a/clang/tools/clang-installapi/CMakeLists.txt
+++ b/clang/tools/clang-installapi/CMakeLists.txt
@@ -25,7 +25,6 @@ clang_target_link_libraries(clang-installapi
   clangAST
   clangInstallAPI
   clangBasic
-  clangOptions
   clangDriver
   clangFrontend
   clangTooling
diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp
index 8bef9690ad855..4e66485343b89 100644
--- a/clang/tools/clang-installapi/ClangInstallAPI.cpp
+++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp
@@ -35,7 +35,7 @@
 
 using namespace clang;
 using namespace clang::installapi;
-using namespace clang::options;
+using namespace clang::driver::options;
 using namespace llvm::opt;
 using namespace llvm::MachO;
 
@@ -71,7 +71,7 @@ static bool runFrontend(StringRef ProgName, Twine Label, bool Verbose,
 static bool run(ArrayRef<const char *> Args, const char *ProgName) {
   // Setup Diagnostics engine.
   DiagnosticOptions DiagOpts;
-  const llvm::opt::OptTable &ClangOpts = getDriverOptTable();
+  const llvm::opt::OptTable &ClangOpts = clang::driver::getDriverOptTable();
   unsigned MissingArgIndex, MissingArgCount;
   llvm::opt::InputArgList ParsedArgs = ClangOpts.ParseArgs(
       ArrayRef(Args).slice(1), MissingArgIndex, MissingArgCount);
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
index f484d6f33ad8f..64324a3f8b010 100644
--- a/clang/tools/clang-installapi/Options.cpp
+++ b/clang/tools/clang-installapi/Options.cpp
@@ -26,6 +26,8 @@ using namespace llvm;
 using namespace llvm::opt;
 using namespace llvm::MachO;
 
+namespace drv = clang::driver::options;
+
 namespace clang {
 namespace installapi {
 
@@ -107,7 +109,7 @@ getArgListFromJSON(const StringRef Input, llvm::opt::OptTable *Table,
 
 bool Options::processDriverOptions(InputArgList &Args) {
   // Handle inputs.
-  for (const StringRef Path : Args.getAllArgValues(options::OPT_INPUT)) {
+  for (const StringRef Path : Args.getAllArgValues(drv::OPT_INPUT)) {
     // Assume any input that is not a directory is a filelist.
     // InstallAPI does not accept multiple directories, so retain the last one.
     if (FM->getOptionalDirectoryRef(Path))
@@ -118,7 +120,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
 
   // Handle output.
   SmallString<PATH_MAX> OutputPath;
-  if (auto *Arg = Args.getLastArg(options::OPT_o)) {
+  if (auto *Arg = Args.getLastArg(drv::OPT_o)) {
     OutputPath = Arg->getValue();
     if (OutputPath != "-")
       FM->makeAbsolutePath(OutputPath);
@@ -130,10 +132,10 @@ bool Options::processDriverOptions(InputArgList &Args) {
   }
 
   // Do basic error checking first for mixing -target and -arch options.
-  auto *ArgArch = Args.getLastArgNoClaim(options::OPT_arch);
-  auto *ArgTarget = Args.getLastArgNoClaim(options::OPT_target);
+  auto *ArgArch = Args.getLastArgNoClaim(drv::OPT_arch);
+  auto *ArgTarget = Args.getLastArgNoClaim(drv::OPT_target);
   auto *ArgTargetVariant =
-      Args.getLastArgNoClaim(options::OPT_darwin_target_variant);
+      Args.getLastArgNoClaim(drv::OPT_darwin_target_variant);
   if (ArgArch && (ArgTarget || ArgTargetVariant)) {
     Diags->Report(clang::diag::err_drv_argument_not_allowed_with)
         << ArgArch->getAsString(Args)
@@ -141,7 +143,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     return false;
   }
 
-  auto *ArgMinTargetOS = Args.getLastArgNoClaim(options::OPT_mtargetos_EQ);
+  auto *ArgMinTargetOS = Args.getLastArgNoClaim(drv::OPT_mtargetos_EQ);
   if ((ArgTarget || ArgTargetVariant) && ArgMinTargetOS) {
     Diags->Report(clang::diag::err_drv_cannot_mix_options)
         << ArgTarget->getAsString(Args) << ArgMinTargetOS->getAsString(Args);
@@ -150,7 +152,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
 
   // Capture target triples first.
   if (ArgTarget) {
-    for (const Arg *A : Args.filtered(options::OPT_target)) {
+    for (const Arg *A : Args.filtered(drv::OPT_target)) {
       A->claim();
       llvm::Triple TargetTriple(A->getValue());
       Target TAPITarget = Target(TargetTriple);
@@ -166,7 +168,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
 
   // Capture target variants.
   DriverOpts.Zippered = ArgTargetVariant != nullptr;
-  for (Arg *A : Args.filtered(options::OPT_darwin_target_variant)) {
+  for (Arg *A : Args.filtered(drv::OPT_darwin_target_variant)) {
     A->claim();
     Triple Variant(A->getValue());
     if (Variant.getVendor() != Triple::Apple) {
@@ -211,7 +213,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     DriverOpts.Targets[TAPIVariant] = Variant;
   }
 
-  DriverOpts.Verbose = Args.hasArgNoClaim(options::OPT_v);
+  DriverOpts.Verbose = Args.hasArgNoClaim(drv::OPT_v);
 
   return true;
 }
@@ -405,7 +407,7 @@ bool Options::processOptionList(InputArgList &Args,
 
 bool Options::processLinkerOptions(InputArgList &Args) {
   // Handle required arguments.
-  if (const Arg *A = Args.getLastArg(options::OPT_install__name))
+  if (const Arg *A = Args.getLastArg(drv::OPT_install__name))
     LinkerOpts.InstallName = A->getValue();
   if (LinkerOpts.InstallName.empty()) {
     Diags->Report(diag::err_no_install_name);
@@ -413,29 +415,28 @@ bool Options::processLinkerOptions(InputArgList &Args) {
   }
 
   // Defaulted or optional arguments.
-  if (auto *Arg = Args.getLastArg(options::OPT_current__version))
+  if (auto *Arg = Args.getLastArg(drv::OPT_current__version))
     LinkerOpts.CurrentVersion.parse64(Arg->getValue());
 
-  if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version))
+  if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version))
     LinkerOpts.CompatVersion.parse64(Arg->getValue());
 
-  if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version))
+  if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version))
     LinkerOpts.CompatVersion.parse64(Arg->getValue());
 
-  if (auto *Arg = Args.getLastArg(options::OPT_umbrella))
+  if (auto *Arg = Args.getLastArg(drv::OPT_umbrella))
     LinkerOpts.ParentUmbrella = Arg->getValue();
 
-  LinkerOpts.IsDylib = Args.hasArg(options::OPT_dynamiclib);
+  LinkerOpts.IsDylib = Args.hasArg(drv::OPT_dynamiclib);
 
-  for (auto *Arg : Args.filtered(options::OPT_alias_list)) {
+  for (auto *Arg : Args.filtered(drv::OPT_alias_list)) {
     LinkerOpts.AliasLists.emplace_back(Arg->getValue());
     Arg->claim();
   }
 
-  LinkerOpts.AppExtensionSafe =
-      Args.hasFlag(options::OPT_fapplication_extension,
-                   options::OPT_fno_application_extension,
-                   /*Default=*/LinkerOpts.AppExtensionSafe);
+  LinkerOpts.AppExtensionSafe = Args.hasFlag(
+      drv::OPT_fapplication_extension, drv::OPT_fno_application_extension,
+      /*Default=*/LinkerOpts.AppExtensionSafe);
 
   if (::getenv("LD_NO_ENCRYPT") != nullptr)
     LinkerOpts.AppExtensionSafe = true;
@@ -445,7 +446,7 @@ bool Options::processLinkerOptions(InputArgList &Args) {
 
   // Capture library paths.
   PathSeq LibraryPaths;
-  for (const Arg *A : Args.filtered(options::OPT_L)) {
+  for (const Arg *A : Args.filtered(drv::OPT_L)) {
     LibraryPaths.emplace_back(A->getValue());
     A->claim();
   }
@@ -460,7 +461,7 @@ bool Options::processLinkerOptions(InputArgList &Args) {
 // invocations.
 bool Options::processFrontendOptions(InputArgList &Args) {
   // Capture language mode.
-  if (auto *A = Args.getLastArgNoClaim(options::OPT_x)) {
+  if (auto *A = Args.getLastArgNoClaim(drv::OPT_x)) {
     FEOpts.LangMode = llvm::StringSwitch<clang::Language>(A->getValue())
                           .Case("c", clang::Language::C)
                           .Case("c++", clang::Language::CXX)
@@ -474,15 +475,15 @@ bool Options::processFrontendOptions(InputArgList &Args) {
       return false;
     }
   }
-  for (auto *A : Args.filtered(options::OPT_ObjC, options::OPT_ObjCXX)) {
-    if (A->getOption().matches(options::OPT_ObjC))
+  for (auto *A : Args.filtered(drv::OPT_ObjC, drv::OPT_ObjCXX)) {
+    if (A->getOption().matches(drv::OPT_ObjC))
       FEOpts.LangMode = clang::Language::ObjC;
     else
       FEOpts.LangMode = clang::Language::ObjCXX;
   }
 
   // Capture Sysroot.
-  if (const Arg *A = Args.getLastArgNoClaim(options::OPT_isysroot)) {
+  if (const Arg *A = Args.getLastArgNoClaim(drv::OPT_isysroot)) {
     SmallString<PATH_MAX> Path(A->getValue());
     FM->makeAbsolutePath(Path);
     if (!FM->getOptionalDirectoryRef(Path)) {
@@ -501,13 +502,13 @@ bool Options::processFrontendOptions(InputArgList &Args) {
   }
 
   // Capture system frameworks for all platforms.
-  for (const Arg *A : Args.filtered(options::OPT_iframework))
+  for (const Arg *A : Args.filtered(drv::OPT_iframework))
     FEOpts.SystemFwkPaths.emplace_back(A->getValue(),
                                        std::optional<PlatformType>{});
 
   // Capture framework paths.
   PathSeq FrameworkPaths;
-  for (const Arg *A : Args.filtered(options::OPT_F))
+  for (const Arg *A : Args.filtered(drv::OPT_F))
     FrameworkPaths.emplace_back(A->getValue());
 
   if (!FrameworkPaths.empty())
diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index db9e0de9b59e0..a9fa61f9f75ee 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -63,7 +63,6 @@ clang_target_link_libraries(clang
   clangDriver
   clangFrontend
   clangFrontendTool
-  clangOptions
   clangSerialization
   )
 
diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp
index 2aef75597fc5f..52cffa4ccbe1f 100644
--- a/clang/tools/driver/cc1_main.cpp
+++ b/clang/tools/driver/cc1_main.cpp
@@ -17,6 +17,7 @@
 #include "clang/CodeGen/ObjectFilePCHContainerWriter.h"
 #include "clang/Config/config.h"
 #include "clang/Driver/DriverDiagnostic.h"
+#include "clang/Driver/Options.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
@@ -24,7 +25,6 @@
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/FrontendTool/Utils.h"
-#include "clang/Options/Options.h"
 #include "clang/Serialization/ObjectFilePCHContainerReader.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp
index f13812f2a8383..50da2f8449a22 100644
--- a/clang/tools/driver/cc1as_main.cpp
+++ b/clang/tools/driver/cc1as_main.cpp
@@ -14,10 +14,10 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Driver/DriverDiagnostic.h"
+#include "clang/Driver/Options.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Frontend/Utils.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -59,7 +59,8 @@
 #include <optional>
 #include <system_error>
 using namespace clang;
-using namespace clang::options;
+using namespace clang::driver;
+using namespace clang::driver::options;
 using namespace llvm;
 using namespace llvm::opt;
 
@@ -687,7 +688,8 @@ int cc1as_main(ArrayRef<const char *> Argv, const char *Argv0, void *MainAddr) {
     getDriverOptTable().printHelp(
         llvm::outs(), "clang -cc1as [options] file...",
         "Clang Integrated Assembler", /*ShowHidden=*/false,
-        /*ShowAllAliases=*/false, llvm::opt::Visibility(options::CC1AsOption));
+        /*ShowAllAliases=*/false,
+        llvm::opt::Visibility(driver::options::CC1AsOption));
 
     return 0;
   }
diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp
index dffa6ba0e4c91..6cf64d70c9399 100644
--- a/clang/tools/driver/driver.cpp
+++ b/clang/tools/driver/driver.cpp
@@ -18,13 +18,13 @@
 #include "clang/Config/config.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/DriverDiagnostic.h"
+#include "clang/Driver/Options.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Frontend/ChainedDiagnosticConsumer.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/SerializedDiagnosticPrinter.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Frontend/Utils.h"
-#include "clang/Options/Options.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp
index e0454f190b35a..62274235c53f5 100644
--- a/clang/unittests/Driver/DXCModeTest.cpp
+++ b/clang/unittests/Driver/DXCModeTest.cpp
@@ -131,8 +131,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
       TC.TranslateArgs(*DAL, "0", Action::OffloadKind::OFK_None)};
   EXPECT_NE(TranslatedArgs, nullptr);
   if (TranslatedArgs) {
-    auto *A =
-        TranslatedArgs->getLastArg(clang::options::OPT_dxil_validator_version);
+    auto *A = TranslatedArgs->getLastArg(
+        clang::driver::options::OPT_dxil_validator_version);
     EXPECT_NE(A, nullptr);
     if (A) {
       EXPECT_STREQ(A->getValue(), "1.1");
diff --git a/clang/www/OpenProjects.html b/clang/www/OpenProjects.html
index 3e5e84b5b2ed4..ae0ec1d4d12cb 100755
--- a/clang/www/OpenProjects.html
+++ b/clang/www/OpenProjects.html
@@ -38,7 +38,7 @@ <h1>Open Clang Projects</h1>
   <li>documenting <a href="https://github.com/llvm/llvm-project/blob/main/clang/include/clang/Basic/DiagnosticDocs.td">
       diagnostic group flags</a> (adding code examples of what is diagnosed, or
       other relevant information), or</li>
-  <li>documenting <a href="https://github.com/llvm/llvm-project/blob/main/clang/include/clang/Options/Options.td">
+  <li>documenting <a href="https://github.com/llvm/llvm-project/blob/main/clang/include/clang/Driver/Options.td">
       command line options</a>, or</li>
   <li>help with completing other missing documentation.</li>
 </ul>
diff --git a/flang/docs/CMakeLists.txt b/flang/docs/CMakeLists.txt
index b183d6add1059..568f942cb4aa6 100644
--- a/flang/docs/CMakeLists.txt
+++ b/flang/docs/CMakeLists.txt
@@ -88,7 +88,7 @@ function (gen_rst_file_from_td output_file td_option source target)
   endif()
   get_filename_component(TABLEGEN_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${source}" DIRECTORY)
   list(APPEND LLVM_TABLEGEN_FLAGS "-I${TABLEGEN_INCLUDE_DIR}")
-  list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Options/")
+  list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Driver/")
   clang_tablegen(Source/${output_file} ${td_option} SOURCE ${source} TARGET ${target})
 endfunction()
 
diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md
index 9953f2252218b..3286171bb1499 100644
--- a/flang/docs/FlangDriver.md
+++ b/flang/docs/FlangDriver.md
@@ -76,7 +76,7 @@ will ignore it when used without `Xflang`.
 As hinted above, `flang` and `flang -fc1` are two separate tools. The
 fact that these tools are accessed through one binary, `flang`, is just an
 implementation detail. Each tool has a separate list of options, albeit defined
-in the same file: `clang/include/clang/Options/Options.td`.
+in the same file: `clang/include/clang/Driver/Options.td`.
 
 The separation helps us split various tasks and allows us to implement more
 specialised tools. In particular, `flang` is not aware of various
@@ -112,7 +112,7 @@ in terms of Clang's driver library, `clangDriver`. This approach allows us to:
   as linkers and assemblers.
 One implication of this dependency on Clang is that all of Flang's compiler
 options are defined alongside Clang's options in
-`clang/include/clang/Options/Options.td`. For options that are common for both
+`clang/include/clang/Driver/Options.td`. For options that are common for both
 Flang and Clang, the corresponding definitions are shared.
 
 Internally, a `clangDriver` based compiler driver works by creating actions
@@ -242,7 +242,7 @@ Adding a new compiler option in Flang consists of two steps:
 
 ### Option Definition
 All of Flang's compiler and frontend driver options are defined in
-`clang/include/clang/Options/Options.td` in Clang. When adding a new option to
+`clang/include/clang/Driver/Options.td` in Clang. When adding a new option to
 Flang, you will either:
   * extend the existing definition for an option that is already available
     in one of Clang's drivers (e.g.  `clang`), but not yet available in Flang, or
@@ -314,7 +314,7 @@ add, you will have to add a dedicated entry in that enum (e.g.
 `ParseSyntaxOnly` for `-fsyntax-only`) and a corresponding `case` in
 `ParseFrontendArgs` function in the `CompilerInvocation.cpp` file, e.g.:
 ```cpp
-    case clang::options::OPT_fsyntax_only:
+    case clang::driver::options::OPT_fsyntax_only:
       opts.programAction = ParseSyntaxOnly;
       break;
 ```
diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt
index bb0b4a39cec9b..2b3bc0e9c2269 100644
--- a/flang/lib/Frontend/CMakeLists.txt
+++ b/flang/lib/Frontend/CMakeLists.txt
@@ -76,7 +76,6 @@ add_flang_library(flangFrontend
   CLANG_LIBS
   clangBasic
   clangDriver
-  clangOptions
 )
 
 target_precompile_headers(flangFrontend PRIVATE
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 8cb9a24cd58b8..5eba5e4cc8a53 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -26,8 +26,8 @@
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/OptionUtils.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/OptionUtils.h"
+#include "clang/Driver/Options.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -82,11 +82,11 @@ static bool parseShowColorsArgs(const llvm::opt::ArgList &args,
 
   for (auto *a : args) {
     const llvm::opt::Option &opt = a->getOption();
-    if (opt.matches(clang::options::OPT_fcolor_diagnostics)) {
+    if (opt.matches(clang::driver::options::OPT_fcolor_diagnostics)) {
       showColors = Colors_On;
-    } else if (opt.matches(clang::options::OPT_fno_color_diagnostics)) {
+    } else if (opt.matches(clang::driver::options::OPT_fno_color_diagnostics)) {
       showColors = Colors_Off;
-    } else if (opt.matches(clang::options::OPT_fdiagnostics_color_EQ)) {
+    } else if (opt.matches(clang::driver::options::OPT_fdiagnostics_color_EQ)) {
       llvm::StringRef value(a->getValue());
       if (value == "always")
         showColors = Colors_On;
@@ -107,13 +107,15 @@ static unsigned getOptimizationLevel(llvm::opt::ArgList &args,
                                      clang::DiagnosticsEngine &diags) {
   unsigned defaultOpt = 0;
 
-  if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_O_Group)) {
-    if (a->getOption().matches(clang::options::OPT_O0))
+  if (llvm::opt::Arg *a =
+          args.getLastArg(clang::driver::options::OPT_O_Group)) {
+    if (a->getOption().matches(clang::driver::options::OPT_O0))
       return 0;
 
-    assert(a->getOption().matches(clang::options::OPT_O));
+    assert(a->getOption().matches(clang::driver::options::OPT_O));
 
-    return getLastArgIntValue(args, clang::options::OPT_O, defaultOpt, diags);
+    return getLastArgIntValue(args, clang::driver::options::OPT_O, defaultOpt,
+                              diags);
   }
 
   return defaultOpt;
@@ -131,7 +133,7 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts,
                            clang::DiagnosticsEngine &diags) {
   using DebugInfoKind = llvm::codegenoptions::DebugInfoKind;
   if (llvm::opt::Arg *arg =
-          args.getLastArg(clang::options::OPT_debug_info_kind_EQ)) {
+          args.getLastArg(clang::driver::options::OPT_debug_info_kind_EQ)) {
     std::optional<DebugInfoKind> val =
         llvm::StringSwitch<std::optional<DebugInfoKind>>(arg->getValue())
             .Case("line-tables-only", llvm::codegenoptions::DebugLineTablesOnly)
@@ -156,13 +158,13 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts,
       diags.Report(debugWarning) << arg->getValue();
     }
     opts.DwarfVersion =
-        getLastArgIntValue(args, clang::options::OPT_dwarf_version_EQ,
+        getLastArgIntValue(args, clang::driver::options::OPT_dwarf_version_EQ,
                            /*Default=*/0, diags);
     if (const llvm::opt::Arg *a =
-            args.getLastArg(clang::options::OPT_split_dwarf_file))
+            args.getLastArg(clang::driver::options::OPT_split_dwarf_file))
       opts.SplitDwarfFile = a->getValue();
     if (const llvm::opt::Arg *a =
-            args.getLastArg(clang::options::OPT_split_dwarf_output))
+            args.getLastArg(clang::driver::options::OPT_split_dwarf_output))
       opts.SplitDwarfOutput = a->getValue();
   }
   return true;
@@ -172,7 +174,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts,
                                      llvm::opt::ArgList &args,
                                      clang::DiagnosticsEngine &diags) {
   llvm::opt::Arg *arg =
-      args.getLastArg(clang::options::OPT_fdo_concurrent_to_openmp_EQ);
+      args.getLastArg(clang::driver::options::OPT_fdo_concurrent_to_openmp_EQ);
   if (!arg)
     return;
 
@@ -197,7 +199,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts,
 static bool parseVectorLibArg(Fortran::frontend::CodeGenOptions &opts,
                               llvm::opt::ArgList &args,
                               clang::DiagnosticsEngine &diags) {
-  llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fveclib);
+  llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fveclib);
   if (!arg)
     return true;
 
@@ -235,7 +237,7 @@ parseOptimizationRemark(clang::DiagnosticsEngine &diags,
   CodeGenOptions::OptRemark result;
 
   for (llvm::opt::Arg *a : args) {
-    if (a->getOption().matches(clang::options::OPT_R_Joined)) {
+    if (a->getOption().matches(clang::driver::options::OPT_R_Joined)) {
       llvm::StringRef value = a->getValue();
 
       if (value == remarkOptName) {
@@ -272,39 +274,39 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
                              clang::DiagnosticsEngine &diags) {
   opts.OptimizationLevel = getOptimizationLevel(args, diags);
 
-  if (args.hasFlag(clang::options::OPT_fdebug_pass_manager,
-                   clang::options::OPT_fno_debug_pass_manager, false))
+  if (args.hasFlag(clang::driver::options::OPT_fdebug_pass_manager,
+                   clang::driver::options::OPT_fno_debug_pass_manager, false))
     opts.DebugPassManager = 1;
 
-  if (args.hasFlag(clang::options::OPT_fstack_arrays,
-                   clang::options::OPT_fno_stack_arrays, false))
+  if (args.hasFlag(clang::driver::options::OPT_fstack_arrays,
+                   clang::driver::options::OPT_fno_stack_arrays, false))
     opts.StackArrays = 1;
 
-  if (args.getLastArg(clang::options::OPT_floop_interchange))
+  if (args.getLastArg(clang::driver::options::OPT_floop_interchange))
     opts.InterchangeLoops = 1;
 
-  if (args.getLastArg(clang::options::OPT_fexperimental_loop_fusion))
+  if (args.getLastArg(clang::driver::options::OPT_fexperimental_loop_fusion))
     opts.FuseLoops = 1;
 
-  if (args.getLastArg(clang::options::OPT_vectorize_loops))
+  if (args.getLastArg(clang::driver::options::OPT_vectorize_loops))
     opts.VectorizeLoop = 1;
 
-  if (args.getLastArg(clang::options::OPT_vectorize_slp))
+  if (args.getLastArg(clang::driver::options::OPT_vectorize_slp))
     opts.VectorizeSLP = 1;
 
-  if (args.hasFlag(clang::options::OPT_floop_versioning,
-                   clang::options::OPT_fno_loop_versioning, false))
+  if (args.hasFlag(clang::driver::options::OPT_floop_versioning,
+                   clang::driver::options::OPT_fno_loop_versioning, false))
     opts.LoopVersioning = 1;
 
-  opts.UnrollLoops = args.hasFlag(clang::options::OPT_funroll_loops,
-                                  clang::options::OPT_fno_unroll_loops,
+  opts.UnrollLoops = args.hasFlag(clang::driver::options::OPT_funroll_loops,
+                                  clang::driver::options::OPT_fno_unroll_loops,
                                   (opts.OptimizationLevel > 1));
 
   opts.AliasAnalysis = opts.OptimizationLevel > 0;
 
   // -mframe-pointer=none/non-leaf/reserved/all option.
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::options::OPT_mframe_pointer_EQ)) {
+          args.getLastArg(clang::driver::options::OPT_mframe_pointer_EQ)) {
     std::optional<llvm::FramePointerKind> val =
         llvm::StringSwitch<std::optional<llvm::FramePointerKind>>(a->getValue())
             .Case("none", llvm::FramePointerKind::None)
@@ -320,7 +322,7 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
       opts.setFramePointer(val.value());
   }
 
-  for (auto *a : args.filtered(clang::options::OPT_fpass_plugin_EQ))
+  for (auto *a : args.filtered(clang::driver::options::OPT_fpass_plugin_EQ))
     opts.LLVMPassPlugins.push_back(a->getValue());
 
   opts.Reciprocals = clang::driver::tools::parseMRecipOption(diags, args);
@@ -329,14 +331,15 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
       clang::driver::tools::parseMPreferVectorWidthOption(diags, args);
 
   // -fembed-offload-object option
-  for (auto *a : args.filtered(clang::options::OPT_fembed_offload_object_EQ))
+  for (auto *a :
+       args.filtered(clang::driver::options::OPT_fembed_offload_object_EQ))
     opts.OffloadObjects.push_back(a->getValue());
 
-  if (args.hasArg(clang::options::OPT_finstrument_functions))
+  if (args.hasArg(clang::driver::options::OPT_finstrument_functions))
     opts.InstrumentFunctions = 1;
 
-  if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::options::OPT_mcode_object_version_EQ)) {
+  if (const llvm::opt::Arg *a = args.getLastArg(
+          clang::driver::options::OPT_mcode_object_version_EQ)) {
     llvm::StringRef s = a->getValue();
     if (s == "6")
       opts.CodeObjectVersion = llvm::CodeObjectVersionKind::COV_6;
@@ -350,36 +353,36 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
 
   // -f[no-]save-optimization-record[=<format>]
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::options::OPT_opt_record_file))
+          args.getLastArg(clang::driver::options::OPT_opt_record_file))
     opts.OptRecordFile = a->getValue();
 
   // Optimization file format. Defaults to yaml
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::options::OPT_opt_record_format))
+          args.getLastArg(clang::driver::options::OPT_opt_record_format))
     opts.OptRecordFormat = a->getValue();
 
   // Specifies, using a regex, which successful optimization passes(middle and
   // backend), to include in the final optimization record file generated. If
   // not provided -fsave-optimization-record will include all passes.
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::options::OPT_opt_record_passes))
+          args.getLastArg(clang::driver::options::OPT_opt_record_passes))
     opts.OptRecordPasses = a->getValue();
 
   // Create OptRemark that allows printing of all successful optimization
   // passes applied.
   opts.OptimizationRemark =
-      parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_EQ,
+      parseOptimizationRemark(diags, args, clang::driver::options::OPT_Rpass_EQ,
                               /*remarkOptName=*/"pass");
 
   // Create OptRemark that allows all missed optimization passes to be printed.
-  opts.OptimizationRemarkMissed =
-      parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_missed_EQ,
-                              /*remarkOptName=*/"pass-missed");
+  opts.OptimizationRemarkMissed = parseOptimizationRemark(
+      diags, args, clang::driver::options::OPT_Rpass_missed_EQ,
+      /*remarkOptName=*/"pass-missed");
 
   // Create OptRemark that allows all optimization decisions made by LLVM
   // to be printed.
   opts.OptimizationRemarkAnalysis = parseOptimizationRemark(
-      diags, args, clang::options::OPT_Rpass_analysis_EQ,
+      diags, args, clang::driver::options::OPT_Rpass_analysis_EQ,
       /*remarkOptName=*/"pass-analysis");
 
   if (opts.getDebugInfo() == llvm::codegenoptions::NoDebugInfo) {
@@ -397,22 +400,23 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
       opts.setDebugInfo(llvm::codegenoptions::LocTrackingOnly);
   }
 
-  if (auto *a = args.getLastArg(clang::options::OPT_save_temps_EQ))
+  if (auto *a = args.getLastArg(clang::driver::options::OPT_save_temps_EQ))
     opts.SaveTempsDir = a->getValue();
 
   // -record-command-line option.
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::options::OPT_record_command_line)) {
+          args.getLastArg(clang::driver::options::OPT_record_command_line)) {
     opts.RecordCommandLine = a->getValue();
   }
 
   // -mlink-builtin-bitcode
-  for (auto *a : args.filtered(clang::options::OPT_mlink_builtin_bitcode))
+  for (auto *a :
+       args.filtered(clang::driver::options::OPT_mlink_builtin_bitcode))
     opts.BuiltinBCLibs.push_back(a->getValue());
 
   // -mrelocation-model option.
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::options::OPT_mrelocation_model)) {
+          args.getLastArg(clang::driver::options::OPT_mrelocation_model)) {
     llvm::StringRef modelName = a->getValue();
     auto relocModel =
         llvm::StringSwitch<std::optional<llvm::Reloc::Model>>(modelName)
@@ -431,30 +435,31 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
   }
 
   // -pic-level and -pic-is-pie option.
-  if (int picLevel =
-          getLastArgIntValue(args, clang::options::OPT_pic_level, 0, diags)) {
+  if (int picLevel = getLastArgIntValue(
+          args, clang::driver::options::OPT_pic_level, 0, diags)) {
     if (picLevel > 2)
       diags.Report(clang::diag::err_drv_invalid_value)
-          << args.getLastArg(clang::options::OPT_pic_level)->getAsString(args)
+          << args.getLastArg(clang::driver::options::OPT_pic_level)
+                 ->getAsString(args)
           << picLevel;
 
     opts.PICLevel = picLevel;
-    if (args.hasArg(clang::options::OPT_pic_is_pie))
+    if (args.hasArg(clang::driver::options::OPT_pic_is_pie))
       opts.IsPIE = 1;
   }
 
-  if (args.hasArg(clang::options::OPT_fprofile_generate)) {
+  if (args.hasArg(clang::driver::options::OPT_fprofile_generate)) {
     opts.setProfileInstr(llvm::driver::ProfileInstrKind::ProfileIRInstr);
   }
 
-  if (auto A = args.getLastArg(clang::options::OPT_fprofile_use_EQ)) {
+  if (auto A = args.getLastArg(clang::driver::options::OPT_fprofile_use_EQ)) {
     opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr);
     opts.ProfileInstrumentUsePath = A->getValue();
   }
 
   // -mcmodel option.
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::options::OPT_mcmodel_EQ)) {
+          args.getLastArg(clang::driver::options::OPT_mcmodel_EQ)) {
     llvm::StringRef modelName = a->getValue();
     std::optional<llvm::CodeModel::Model> codeModel = getCodeModel(modelName);
 
@@ -465,8 +470,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
           << a->getAsString(args) << modelName;
   }
 
-  if (const llvm::opt::Arg *arg =
-          args.getLastArg(clang::options::OPT_mlarge_data_threshold_EQ)) {
+  if (const llvm::opt::Arg *arg = args.getLastArg(
+          clang::driver::options::OPT_mlarge_data_threshold_EQ)) {
     uint64_t LDT;
     if (llvm::StringRef(arg->getValue()).getAsInteger(/*Radix=*/10, LDT)) {
       diags.Report(clang::diag::err_drv_invalid_value)
@@ -476,8 +481,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
   }
 
   // This option is compatible with -f[no-]underscoring in gfortran.
-  if (args.hasFlag(clang::options::OPT_fno_underscoring,
-                   clang::options::OPT_funderscoring, false)) {
+  if (args.hasFlag(clang::driver::options::OPT_fno_underscoring,
+                   clang::driver::options::OPT_funderscoring, false)) {
     opts.Underscoring = 0;
   }
 
@@ -494,7 +499,7 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
                    clang::driver::options::OPT_fno_defer_desc_map, true);
 
   if (const llvm::opt::Arg *arg =
-          args.getLastArg(clang::options::OPT_complex_range_EQ)) {
+          args.getLastArg(clang::driver::options::OPT_complex_range_EQ)) {
     llvm::StringRef argValue = llvm::StringRef(arg->getValue());
     if (argValue == "full") {
       opts.setComplexRange(CodeGenOptions::ComplexRangeKind::CX_Full);
@@ -515,42 +520,46 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
 /// \param [in] opts The target options instance to update
 /// \param [in] args The list of input arguments (from the compiler invocation)
 static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) {
-  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_triple))
+  if (const llvm::opt::Arg *a =
+          args.getLastArg(clang::driver::options::OPT_triple))
     opts.triple = a->getValue();
 
-  opts.atomicIgnoreDenormalMode =
-      args.hasFlag(clang::options::OPT_fatomic_ignore_denormal_mode,
-                   clang::options::OPT_fno_atomic_ignore_denormal_mode, false);
-  opts.atomicFineGrainedMemory =
-      args.hasFlag(clang::options::OPT_fatomic_fine_grained_memory,
-                   clang::options::OPT_fno_atomic_fine_grained_memory, false);
+  opts.atomicIgnoreDenormalMode = args.hasFlag(
+      clang::driver::options::OPT_fatomic_ignore_denormal_mode,
+      clang::driver::options::OPT_fno_atomic_ignore_denormal_mode, false);
+  opts.atomicFineGrainedMemory = args.hasFlag(
+      clang::driver::options::OPT_fatomic_fine_grained_memory,
+      clang::driver::options::OPT_fno_atomic_fine_grained_memory, false);
   opts.atomicRemoteMemory =
-      args.hasFlag(clang::options::OPT_fatomic_remote_memory,
-                   clang::options::OPT_fno_atomic_remote_memory, false);
+      args.hasFlag(clang::driver::options::OPT_fatomic_remote_memory,
+                   clang::driver::options::OPT_fno_atomic_remote_memory, false);
 
-  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_target_cpu))
+  if (const llvm::opt::Arg *a =
+          args.getLastArg(clang::driver::options::OPT_target_cpu))
     opts.cpu = a->getValue();
 
-  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_tune_cpu))
+  if (const llvm::opt::Arg *a =
+          args.getLastArg(clang::driver::options::OPT_tune_cpu))
     opts.cpuToTuneFor = a->getValue();
 
   for (const llvm::opt::Arg *currentArg :
-       args.filtered(clang::options::OPT_target_feature))
+       args.filtered(clang::driver::options::OPT_target_feature))
     opts.featuresAsWritten.emplace_back(currentArg->getValue());
 
-  if (args.hasArg(clang::options::OPT_fdisable_real_10))
+  if (args.hasArg(clang::driver::options::OPT_fdisable_real_10))
     opts.disabledRealKinds.push_back(10);
 
-  if (args.hasArg(clang::options::OPT_fdisable_real_3))
+  if (args.hasArg(clang::driver::options::OPT_fdisable_real_3))
     opts.disabledRealKinds.push_back(3);
 
-  if (args.hasArg(clang::options::OPT_fdisable_integer_2))
+  if (args.hasArg(clang::driver::options::OPT_fdisable_integer_2))
     opts.disabledIntegerKinds.push_back(2);
 
-  if (args.hasArg(clang::options::OPT_fdisable_integer_16))
+  if (args.hasArg(clang::driver::options::OPT_fdisable_integer_16))
     opts.disabledIntegerKinds.push_back(16);
 
-  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_mabi_EQ)) {
+  if (const llvm::opt::Arg *a =
+          args.getLastArg(clang::driver::options::OPT_mabi_EQ)) {
     opts.abi = a->getValue();
     llvm::StringRef V = a->getValue();
     if (V == "vec-extabi") {
@@ -560,8 +569,9 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) {
     }
   }
 
-  opts.asmVerbose = args.hasFlag(clang::options::OPT_fverbose_asm,
-                                 clang::options::OPT_fno_verbose_asm, false);
+  opts.asmVerbose =
+      args.hasFlag(clang::driver::options::OPT_fverbose_asm,
+                   clang::driver::options::OPT_fno_verbose_asm, false);
 }
 // Tweak the frontend configuration based on the frontend action
 static void setUpFrontendBasedOnAction(FrontendOptions &opts) {
@@ -594,11 +604,11 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
   // Treat multiple action options as an invocation error. Note that `clang
   // -cc1` does accept multiple action options, but will only consider the
   // rightmost one.
-  if (args.hasMultipleArgs(clang::options::OPT_Action_Group)) {
+  if (args.hasMultipleArgs(clang::driver::options::OPT_Action_Group)) {
     llvm::SmallString<32> buf;
     llvm::raw_svector_ostream os(buf);
     for (const llvm::opt::Arg *arg :
-         args.filtered(clang::options::OPT_Action_Group)) {
+         args.filtered(clang::driver::options::OPT_Action_Group)) {
       if (buf.size())
         os << ", ";
       os << "'" << arg->getSpelling() << "'";
@@ -609,99 +619,99 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
 
   // Identify the action (i.e. opts.ProgramAction)
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::options::OPT_Action_Group)) {
+          args.getLastArg(clang::driver::options::OPT_Action_Group)) {
     switch (a->getOption().getID()) {
     default: {
       llvm_unreachable("Invalid option in group!");
     }
-    case clang::options::OPT_test_io:
+    case clang::driver::options::OPT_test_io:
       opts.programAction = InputOutputTest;
       break;
-    case clang::options::OPT_E:
+    case clang::driver::options::OPT_E:
       opts.programAction = PrintPreprocessedInput;
       break;
-    case clang::options::OPT_fsyntax_only:
+    case clang::driver::options::OPT_fsyntax_only:
       opts.programAction = ParseSyntaxOnly;
       break;
-    case clang::options::OPT_emit_fir:
+    case clang::driver::options::OPT_emit_fir:
       opts.programAction = EmitFIR;
       break;
-    case clang::options::OPT_emit_hlfir:
+    case clang::driver::options::OPT_emit_hlfir:
       opts.programAction = EmitHLFIR;
       break;
-    case clang::options::OPT_emit_llvm:
+    case clang::driver::options::OPT_emit_llvm:
       opts.programAction = EmitLLVM;
       break;
-    case clang::options::OPT_emit_llvm_bc:
+    case clang::driver::options::OPT_emit_llvm_bc:
       opts.programAction = EmitLLVMBitcode;
       break;
-    case clang::options::OPT_emit_obj:
+    case clang::driver::options::OPT_emit_obj:
       opts.programAction = EmitObj;
       break;
-    case clang::options::OPT_S:
+    case clang::driver::options::OPT_S:
       opts.programAction = EmitAssembly;
       break;
-    case clang::options::OPT_fdebug_unparse:
+    case clang::driver::options::OPT_fdebug_unparse:
       opts.programAction = DebugUnparse;
       break;
-    case clang::options::OPT_fdebug_unparse_no_sema:
+    case clang::driver::options::OPT_fdebug_unparse_no_sema:
       opts.programAction = DebugUnparseNoSema;
       break;
-    case clang::options::OPT_fdebug_unparse_with_symbols:
+    case clang::driver::options::OPT_fdebug_unparse_with_symbols:
       opts.programAction = DebugUnparseWithSymbols;
       break;
-    case clang::options::OPT_fdebug_unparse_with_modules:
+    case clang::driver::options::OPT_fdebug_unparse_with_modules:
       opts.programAction = DebugUnparseWithModules;
       break;
-    case clang::options::OPT_fdebug_dump_symbols:
+    case clang::driver::options::OPT_fdebug_dump_symbols:
       opts.programAction = DebugDumpSymbols;
       break;
-    case clang::options::OPT_fdebug_dump_parse_tree:
+    case clang::driver::options::OPT_fdebug_dump_parse_tree:
       opts.programAction = DebugDumpParseTree;
       break;
-    case clang::options::OPT_fdebug_dump_pft:
+    case clang::driver::options::OPT_fdebug_dump_pft:
       opts.programAction = DebugDumpPFT;
       break;
-    case clang::options::OPT_fdebug_dump_all:
+    case clang::driver::options::OPT_fdebug_dump_all:
       opts.programAction = DebugDumpAll;
       break;
-    case clang::options::OPT_fdebug_dump_parse_tree_no_sema:
+    case clang::driver::options::OPT_fdebug_dump_parse_tree_no_sema:
       opts.programAction = DebugDumpParseTreeNoSema;
       break;
-    case clang::options::OPT_fdebug_dump_provenance:
+    case clang::driver::options::OPT_fdebug_dump_provenance:
       opts.programAction = DebugDumpProvenance;
       break;
-    case clang::options::OPT_fdebug_dump_parsing_log:
+    case clang::driver::options::OPT_fdebug_dump_parsing_log:
       opts.programAction = DebugDumpParsingLog;
       break;
-    case clang::options::OPT_fdebug_measure_parse_tree:
+    case clang::driver::options::OPT_fdebug_measure_parse_tree:
       opts.programAction = DebugMeasureParseTree;
       break;
-    case clang::options::OPT_fdebug_pre_fir_tree:
+    case clang::driver::options::OPT_fdebug_pre_fir_tree:
       opts.programAction = DebugPreFIRTree;
       break;
-    case clang::options::OPT_fget_symbols_sources:
+    case clang::driver::options::OPT_fget_symbols_sources:
       opts.programAction = GetSymbolsSources;
       break;
-    case clang::options::OPT_fget_definition:
+    case clang::driver::options::OPT_fget_definition:
       opts.programAction = GetDefinition;
       break;
-    case clang::options::OPT_init_only:
+    case clang::driver::options::OPT_init_only:
       opts.programAction = InitOnly;
       break;
 
       // TODO:
-      // case clang::options::OPT_emit_llvm:
-      // case clang::options::OPT_emit_llvm_only:
-      // case clang::options::OPT_emit_codegen_only:
-      // case clang::options::OPT_emit_module:
+      // case clang::driver::options::OPT_emit_llvm:
+      // case clang::driver::options::OPT_emit_llvm_only:
+      // case clang::driver::options::OPT_emit_codegen_only:
+      // case clang::driver::options::OPT_emit_module:
       // (...)
     }
 
     // Parse the values provided with `-fget-definition` (there should be 3
     // integers)
     if (llvm::opt::OptSpecifier(a->getOption().getID()) ==
-        clang::options::OPT_fget_definition) {
+        clang::driver::options::OPT_fget_definition) {
       unsigned optVals[3] = {0, 0, 0};
 
       for (unsigned i = 0; i < 3; i++) {
@@ -721,25 +731,27 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
   }
 
   // Parsing -load <dsopath> option and storing shared object path
-  if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_load)) {
+  if (llvm::opt::Arg *a = args.getLastArg(clang::driver::options::OPT_load)) {
     opts.plugins.push_back(a->getValue());
   }
 
   // Parsing -plugin <name> option and storing plugin name and setting action
-  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_plugin)) {
+  if (const llvm::opt::Arg *a =
+          args.getLastArg(clang::driver::options::OPT_plugin)) {
     opts.programAction = PluginAction;
     opts.actionName = a->getValue();
   }
 
-  opts.outputFile = args.getLastArgValue(clang::options::OPT_o);
-  opts.showHelp = args.hasArg(clang::options::OPT_help);
-  opts.showVersion = args.hasArg(clang::options::OPT_version);
+  opts.outputFile = args.getLastArgValue(clang::driver::options::OPT_o);
+  opts.showHelp = args.hasArg(clang::driver::options::OPT_help);
+  opts.showVersion = args.hasArg(clang::driver::options::OPT_version);
   opts.printSupportedCPUs =
-      args.hasArg(clang::options::OPT_print_supported_cpus);
+      args.hasArg(clang::driver::options::OPT_print_supported_cpus);
 
   // Get the input kind (from the value passed via `-x`)
   InputKind dashX(Language::Unknown);
-  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_x)) {
+  if (const llvm::opt::Arg *a =
+          args.getLastArg(clang::driver::options::OPT_x)) {
     llvm::StringRef xValue = a->getValue();
     // Principal languages.
     dashX = llvm::StringSwitch<InputKind>(xValue)
@@ -766,7 +778,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
 
   // Collect the input files and save them in our instance of FrontendOptions.
   std::vector<std::string> inputs =
-      args.getAllArgValues(clang::options::OPT_INPUT);
+      args.getAllArgValues(clang::driver::options::OPT_INPUT);
   opts.inputs.clear();
   if (inputs.empty())
     // '-' is the default input if none is given.
@@ -786,16 +798,18 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
   }
 
   // Set fortranForm based on options -ffree-form and -ffixed-form.
-  if (const auto *arg = args.getLastArg(clang::options::OPT_ffixed_form,
-                                        clang::options::OPT_ffree_form)) {
-    opts.fortranForm = arg->getOption().matches(clang::options::OPT_ffixed_form)
-                           ? FortranForm::FixedForm
-                           : FortranForm::FreeForm;
+  if (const auto *arg =
+          args.getLastArg(clang::driver::options::OPT_ffixed_form,
+                          clang::driver::options::OPT_ffree_form)) {
+    opts.fortranForm =
+        arg->getOption().matches(clang::driver::options::OPT_ffixed_form)
+            ? FortranForm::FixedForm
+            : FortranForm::FreeForm;
   }
 
   // Set fixedFormColumns based on -ffixed-line-length=<value>
   if (const auto *arg =
-          args.getLastArg(clang::options::OPT_ffixed_line_length_EQ)) {
+          args.getLastArg(clang::driver::options::OPT_ffixed_line_length_EQ)) {
     llvm::StringRef argValue = llvm::StringRef(arg->getValue());
     std::int64_t columns = -1;
     if (argValue == "none") {
@@ -817,7 +831,8 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
   }
 
   // Set conversion based on -fconvert=<value>
-  if (const auto *arg = args.getLastArg(clang::options::OPT_fconvert_EQ)) {
+  if (const auto *arg =
+          args.getLastArg(clang::driver::options::OPT_fconvert_EQ)) {
     const char *argValue = arg->getValue();
     if (auto convert = parseConvertArg(argValue))
       opts.envDefaults.push_back({"FORT_CONVERT", *convert});
@@ -827,55 +842,65 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
   }
 
   // -f{no-}implicit-none
-  opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneTypeAlways,
-                       args.hasFlag(clang::options::OPT_fimplicit_none,
-                                    clang::options::OPT_fno_implicit_none,
-                                    false));
+  opts.features.Enable(
+      Fortran::common::LanguageFeature::ImplicitNoneTypeAlways,
+      args.hasFlag(clang::driver::options::OPT_fimplicit_none,
+                   clang::driver::options::OPT_fno_implicit_none, false));
 
   // -f{no-}implicit-none-ext
-  opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneExternal,
-                       args.hasFlag(clang::options::OPT_fimplicit_none_ext,
-                                    clang::options::OPT_fno_implicit_none_ext,
-                                    false));
+  opts.features.Enable(
+      Fortran::common::LanguageFeature::ImplicitNoneExternal,
+      args.hasFlag(clang::driver::options::OPT_fimplicit_none_ext,
+                   clang::driver::options::OPT_fno_implicit_none_ext, false));
 
   // -f{no-}backslash
   opts.features.Enable(Fortran::common::LanguageFeature::BackslashEscapes,
-                       args.hasFlag(clang::options::OPT_fbackslash,
-                                    clang::options::OPT_fno_backslash, false));
+                       args.hasFlag(clang::driver::options::OPT_fbackslash,
+                                    clang::driver::options::OPT_fno_backslash,
+                                    false));
 
   // -f{no-}logical-abbreviations
   opts.features.Enable(
       Fortran::common::LanguageFeature::LogicalAbbreviations,
-      args.hasFlag(clang::options::OPT_flogical_abbreviations,
-                   clang::options::OPT_fno_logical_abbreviations, false));
+      args.hasFlag(clang::driver::options::OPT_flogical_abbreviations,
+                   clang::driver::options::OPT_fno_logical_abbreviations,
+                   false));
 
   // -f{no-}unsigned
   opts.features.Enable(Fortran::common::LanguageFeature::Unsigned,
-                       args.hasFlag(clang::options::OPT_funsigned,
-                                    clang::options::OPT_fno_unsigned, false));
+                       args.hasFlag(clang::driver::options::OPT_funsigned,
+                                    clang::driver::options::OPT_fno_unsigned,
+                                    false));
 
   // -f{no-}xor-operator
-  opts.features.Enable(Fortran::common::LanguageFeature::XOROperator,
-                       args.hasFlag(clang::options::OPT_fxor_operator,
-                                    clang::options::OPT_fno_xor_operator,
-                                    false));
+  opts.features.Enable(
+      Fortran::common::LanguageFeature::XOROperator,
+      args.hasFlag(clang::driver::options::OPT_fxor_operator,
+                   clang::driver::options::OPT_fno_xor_operator, false));
 
   // -fno-automatic
-  if (args.hasArg(clang::options::OPT_fno_automatic)) {
+  if (args.hasArg(clang::driver::options::OPT_fno_automatic)) {
     opts.features.Enable(Fortran::common::LanguageFeature::DefaultSave);
   }
 
   // -f{no}-save-main-program
-  opts.features.Enable(Fortran::common::LanguageFeature::SaveMainProgram,
-                       args.hasFlag(clang::options::OPT_fsave_main_program,
-                                    clang::options::OPT_fno_save_main_program,
-                                    false));
+  opts.features.Enable(
+      Fortran::common::LanguageFeature::SaveMainProgram,
+      args.hasFlag(clang::driver::options::OPT_fsave_main_program,
+                   clang::driver::options::OPT_fno_save_main_program, false));
+
+  // -ffast-amd-memory-allocator
+  if (args.hasArg(clang::driver::options::OPT_ffast_amd_memory_allocator)) {
+    opts.features.Enable(
+        (Fortran::common::LanguageFeature::AmdMemoryAllocator));
+  }
 
-  if (args.hasArg(clang::options::OPT_falternative_parameter_statement)) {
+  if (args.hasArg(
+          clang::driver::options::OPT_falternative_parameter_statement)) {
     opts.features.Enable(Fortran::common::LanguageFeature::OldStyleParameter);
   }
   if (const llvm::opt::Arg *arg =
-          args.getLastArg(clang::options::OPT_finput_charset_EQ)) {
+          args.getLastArg(clang::driver::options::OPT_finput_charset_EQ)) {
     llvm::StringRef argValue = arg->getValue();
     if (argValue == "utf-8") {
       opts.encoding = Fortran::parser::Encoding::UTF_8;
@@ -920,9 +945,9 @@ static std::string getOpenMPHeadersDir(const char *argv) {
 static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts,
                                   llvm::opt::ArgList &args) {
   // Add macros from the command line.
-  for (const auto *currentArg :
-       args.filtered(clang::options::OPT_D, clang::options::OPT_U)) {
-    if (currentArg->getOption().matches(clang::options::OPT_D)) {
+  for (const auto *currentArg : args.filtered(clang::driver::options::OPT_D,
+                                              clang::driver::options::OPT_U)) {
+    if (currentArg->getOption().matches(clang::driver::options::OPT_D)) {
       opts.addMacroDef(currentArg->getValue());
     } else {
       opts.addMacroUndef(currentArg->getValue());
@@ -930,33 +955,34 @@ static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts,
   }
 
   // Add the ordered list of -I's.
-  for (const auto *currentArg : args.filtered(clang::options::OPT_I))
+  for (const auto *currentArg : args.filtered(clang::driver::options::OPT_I))
     opts.searchDirectoriesFromDashI.emplace_back(currentArg->getValue());
 
   // Prepend the ordered list of -intrinsic-modules-path
   // to the default location to search.
   for (const auto *currentArg :
-       args.filtered(clang::options::OPT_fintrinsic_modules_path))
+       args.filtered(clang::driver::options::OPT_fintrinsic_modules_path))
     opts.searchDirectoriesFromIntrModPath.emplace_back(currentArg->getValue());
 
   // -cpp/-nocpp
-  if (const auto *currentArg =
-          args.getLastArg(clang::options::OPT_cpp, clang::options::OPT_nocpp))
-    opts.macrosFlag = (currentArg->getOption().matches(clang::options::OPT_cpp))
-                          ? PPMacrosFlag::Include
-                          : PPMacrosFlag::Exclude;
+  if (const auto *currentArg = args.getLastArg(
+          clang::driver::options::OPT_cpp, clang::driver::options::OPT_nocpp))
+    opts.macrosFlag =
+        (currentArg->getOption().matches(clang::driver::options::OPT_cpp))
+            ? PPMacrosFlag::Include
+            : PPMacrosFlag::Exclude;
   // Enable -cpp based on -x unless explicitly disabled with -nocpp
   if (opts.macrosFlag != PPMacrosFlag::Exclude)
-    if (const auto *dashX = args.getLastArg(clang::options::OPT_x))
+    if (const auto *dashX = args.getLastArg(clang::driver::options::OPT_x))
       opts.macrosFlag = llvm::StringSwitch<PPMacrosFlag>(dashX->getValue())
                             .Case("f95-cpp-input", PPMacrosFlag::Include)
                             .Default(opts.macrosFlag);
 
-  opts.noReformat = args.hasArg(clang::options::OPT_fno_reformat);
+  opts.noReformat = args.hasArg(clang::driver::options::OPT_fno_reformat);
   opts.preprocessIncludeLines =
-      args.hasArg(clang::options::OPT_fpreprocess_include_lines);
-  opts.noLineDirectives = args.hasArg(clang::options::OPT_P);
-  opts.showMacros = args.hasArg(clang::options::OPT_dM);
+      args.hasArg(clang::driver::options::OPT_fpreprocess_include_lines);
+  opts.noLineDirectives = args.hasArg(clang::driver::options::OPT_P);
+  opts.showMacros = args.hasArg(clang::driver::options::OPT_dM);
 }
 
 /// Parses all semantic related arguments and populates the variables
@@ -967,7 +993,7 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
 
   // -J/module-dir option
   std::vector<std::string> moduleDirList =
-      args.getAllArgValues(clang::options::OPT_module_dir);
+      args.getAllArgValues(clang::driver::options::OPT_module_dir);
   // User can only specify one -J/-module-dir directory, but may repeat
   // -J/-module-dir as long as the directory is the same each time.
   // https://gcc.gnu.org/onlinedocs/gfortran/Directory-Options.html
@@ -986,25 +1012,25 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
     res.setModuleDir(moduleDirList[0]);
 
   // -fdebug-module-writer option
-  if (args.hasArg(clang::options::OPT_fdebug_module_writer)) {
+  if (args.hasArg(clang::driver::options::OPT_fdebug_module_writer)) {
     res.setDebugModuleDir(true);
   }
 
   // -fhermetic-module-files option
-  if (args.hasArg(clang::options::OPT_fhermetic_module_files)) {
+  if (args.hasArg(clang::driver::options::OPT_fhermetic_module_files)) {
     res.setHermeticModuleFileOutput(true);
   }
 
   // -module-suffix
   if (const auto *moduleSuffix =
-          args.getLastArg(clang::options::OPT_module_suffix)) {
+          args.getLastArg(clang::driver::options::OPT_module_suffix)) {
     res.setModuleFileSuffix(moduleSuffix->getValue());
   }
 
   // -f{no-}analyzed-objects-for-unparse
-  res.setUseAnalyzedObjectsForUnparse(
-      args.hasFlag(clang::options::OPT_fanalyzed_objects_for_unparse,
-                   clang::options::OPT_fno_analyzed_objects_for_unparse, true));
+  res.setUseAnalyzedObjectsForUnparse(args.hasFlag(
+      clang::driver::options::OPT_fanalyzed_objects_for_unparse,
+      clang::driver::options::OPT_fno_analyzed_objects_for_unparse, true));
 
   return diags.getNumErrors() == numErrorsBefore;
 }
@@ -1021,7 +1047,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   // chosen to match clang's behavior.
 
   // -pedantic
-  if (args.hasArg(clang::options::OPT_pedantic)) {
+  if (args.hasArg(clang::driver::options::OPT_pedantic)) {
     features.WarnOnAllNonstandard();
     features.WarnOnAllUsage();
     res.setEnableConformanceChecks();
@@ -1031,8 +1057,9 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   // -Werror option
   // TODO: Currently throws a Diagnostic for anything other than -W<error>,
   // this has to change when other -W<opt>'s are supported.
-  if (args.hasArg(clang::options::OPT_W_Joined)) {
-    const auto &wArgs = args.getAllArgValues(clang::options::OPT_W_Joined);
+  if (args.hasArg(clang::driver::options::OPT_W_Joined)) {
+    const auto &wArgs =
+        args.getAllArgValues(clang::driver::options::OPT_W_Joined);
     for (const auto &wArg : wArgs) {
       if (wArg == "error") {
         res.setWarnAsErr(true);
@@ -1049,7 +1076,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   }
 
   // -w
-  if (args.hasArg(clang::options::OPT_w)) {
+  if (args.hasArg(clang::driver::options::OPT_w)) {
     features.DisableAllWarnings();
     res.setDisableWarnings();
   }
@@ -1069,7 +1096,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   unsigned numErrorsBefore = diags.getNumErrors();
 
   // -fd-lines-as-code
-  if (args.hasArg(clang::options::OPT_fd_lines_as_code)) {
+  if (args.hasArg(clang::driver::options::OPT_fd_lines_as_code)) {
     if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) {
       const unsigned fdLinesAsWarning = diags.getCustomDiagID(
           clang::DiagnosticsEngine::Warning,
@@ -1082,7 +1109,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   }
 
   // -fd-lines-as-comments
-  if (args.hasArg(clang::options::OPT_fd_lines_as_comments)) {
+  if (args.hasArg(clang::driver::options::OPT_fd_lines_as_comments)) {
     if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) {
       const unsigned fdLinesAsWarning = diags.getCustomDiagID(
           clang::DiagnosticsEngine::Warning,
@@ -1095,18 +1122,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   }
 
   // -fdefault* family
-  if (args.hasArg(clang::options::OPT_fdefault_real_8)) {
+  if (args.hasArg(clang::driver::options::OPT_fdefault_real_8)) {
     res.getDefaultKinds().set_defaultRealKind(8);
     res.getDefaultKinds().set_doublePrecisionKind(16);
   }
-  if (args.hasArg(clang::options::OPT_fdefault_integer_8)) {
+  if (args.hasArg(clang::driver::options::OPT_fdefault_integer_8)) {
     res.getDefaultKinds().set_defaultIntegerKind(8);
     res.getDefaultKinds().set_subscriptIntegerKind(8);
     res.getDefaultKinds().set_sizeIntegerKind(8);
     res.getDefaultKinds().set_defaultLogicalKind(8);
   }
-  if (args.hasArg(clang::options::OPT_fdefault_double_8)) {
-    if (!args.hasArg(clang::options::OPT_fdefault_real_8)) {
+  if (args.hasArg(clang::driver::options::OPT_fdefault_double_8)) {
+    if (!args.hasArg(clang::driver::options::OPT_fdefault_real_8)) {
       // -fdefault-double-8 has to be used with -fdefault-real-8
       // to be compatible with gfortran
       const unsigned diagID = diags.getCustomDiagID(
@@ -1117,18 +1144,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
     // https://gcc.gnu.org/onlinedocs/gfortran/Fortran-Dialect-Options.html
     res.getDefaultKinds().set_doublePrecisionKind(8);
   }
-  if (args.hasArg(clang::options::OPT_flarge_sizes))
+  if (args.hasArg(clang::driver::options::OPT_flarge_sizes))
     res.getDefaultKinds().set_sizeIntegerKind(8);
 
   // -x cuda
-  auto language = args.getLastArgValue(clang::options::OPT_x);
+  auto language = args.getLastArgValue(clang::driver::options::OPT_x);
   if (language == "cuda") {
     res.getFrontendOpts().features.Enable(
         Fortran::common::LanguageFeature::CUDA);
   }
 
   // -fopenacc
-  if (args.hasArg(clang::options::OPT_fopenacc)) {
+  if (args.hasArg(clang::driver::options::OPT_fopenacc)) {
     res.getFrontendOpts().features.Enable(
         Fortran::common::LanguageFeature::OpenACC);
   }
@@ -1136,8 +1163,8 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   // -std=f2018
   // TODO: Set proper options when more fortran standards
   // are supported.
-  if (args.hasArg(clang::options::OPT_std_EQ)) {
-    auto standard = args.getLastArgValue(clang::options::OPT_std_EQ);
+  if (args.hasArg(clang::driver::options::OPT_std_EQ)) {
+    auto standard = args.getLastArgValue(clang::driver::options::OPT_std_EQ);
     // We only allow f2018 as the given standard
     if (standard == "f2018") {
       res.setEnableConformanceChecks();
@@ -1150,7 +1177,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
     }
   }
   // -fcoarray
-  if (args.hasArg(clang::options::OPT_fcoarray)) {
+  if (args.hasArg(clang::driver::options::OPT_fcoarray)) {
     res.getFrontendOpts().features.Enable(
         Fortran::common::LanguageFeature::Coarray);
     const unsigned diagID =
@@ -1168,12 +1195,13 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
 /// generated.
 static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
                             clang::DiagnosticsEngine &diags) {
-  llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fopenmp,
-                                        clang::options::OPT_fno_openmp);
-  if (!arg || arg->getOption().matches(clang::options::OPT_fno_openmp)) {
-    bool isSimdSpecified =
-        args.hasFlag(clang::options::OPT_fopenmp_simd,
-                     clang::options::OPT_fno_openmp_simd, /*Default=*/false);
+  llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp,
+                                        clang::driver::options::OPT_fno_openmp);
+  if (!arg ||
+      arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) {
+    bool isSimdSpecified = args.hasFlag(
+        clang::driver::options::OPT_fopenmp_simd,
+        clang::driver::options::OPT_fno_openmp_simd, /*Default=*/false);
     if (!isSimdSpecified)
       return true;
     res.getLangOpts().OpenMPSimd = 1;
@@ -1189,7 +1217,8 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   res.getLangOpts().OpenMPVersion = newestFullySupported;
   res.getFrontendOpts().features.Enable(
       Fortran::common::LanguageFeature::OpenMP);
-  if (auto *arg = args.getLastArg(clang::options::OPT_fopenmp_version_EQ)) {
+  if (auto *arg =
+          args.getLastArg(clang::driver::options::OPT_fopenmp_version_EQ)) {
     llvm::ArrayRef<unsigned> ompVersions = llvm::omp::getOpenMPVersions();
     unsigned oldVersions[] = {11, 20, 25, 30};
     unsigned version = 0;
@@ -1244,16 +1273,16 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
     }
   }
 
-  if (args.hasArg(clang::options::OPT_fopenmp_force_usm)) {
+  if (args.hasArg(clang::driver::options::OPT_fopenmp_force_usm)) {
     res.getLangOpts().OpenMPForceUSM = 1;
   }
-  if (args.hasArg(clang::options::OPT_fopenmp_is_target_device)) {
+  if (args.hasArg(clang::driver::options::OPT_fopenmp_is_target_device)) {
     res.getLangOpts().OpenMPIsTargetDevice = 1;
 
     // Get OpenMP host file path if any and report if a non existent file is
     // found
-    if (auto *arg =
-            args.getLastArg(clang::options::OPT_fopenmp_host_ir_file_path)) {
+    if (auto *arg = args.getLastArg(
+            clang::driver::options::OPT_fopenmp_host_ir_file_path)) {
       res.getLangOpts().OMPHostIRFile = arg->getValue();
       if (!llvm::sys::fs::exists(res.getLangOpts().OMPHostIRFile))
         diags.Report(clang::diag::err_omp_host_ir_file_not_found)
@@ -1261,34 +1290,37 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
     }
 
     if (args.hasFlag(
-            clang::options::OPT_fopenmp_assume_teams_oversubscription,
-            clang::options::OPT_fno_openmp_assume_teams_oversubscription,
+            clang::driver::options::OPT_fopenmp_assume_teams_oversubscription,
+            clang::driver::options::
+                OPT_fno_openmp_assume_teams_oversubscription,
             /*Default=*/false))
       res.getLangOpts().OpenMPTeamSubscription = true;
 
-    if (args.hasArg(clang::options::OPT_fopenmp_assume_no_thread_state))
+    if (args.hasArg(clang::driver::options::OPT_fopenmp_assume_no_thread_state))
       res.getLangOpts().OpenMPNoThreadState = 1;
 
-    if (args.hasArg(clang::options::OPT_fopenmp_assume_no_nested_parallelism))
+    if (args.hasArg(
+            clang::driver::options::OPT_fopenmp_assume_no_nested_parallelism))
       res.getLangOpts().OpenMPNoNestedParallelism = 1;
 
     if (args.hasFlag(
-            clang::options::OPT_fopenmp_assume_threads_oversubscription,
-            clang::options::OPT_fno_openmp_assume_threads_oversubscription,
+            clang::driver::options::OPT_fopenmp_assume_threads_oversubscription,
+            clang::driver::options::
+                OPT_fno_openmp_assume_threads_oversubscription,
             /*Default=*/false))
       res.getLangOpts().OpenMPThreadSubscription = true;
 
-    if ((args.hasArg(clang::options::OPT_fopenmp_target_debug) ||
-         args.hasArg(clang::options::OPT_fopenmp_target_debug_EQ))) {
-      res.getLangOpts().OpenMPTargetDebug =
-          getLastArgIntValue(args, clang::options::OPT_fopenmp_target_debug_EQ,
-                             res.getLangOpts().OpenMPTargetDebug, diags);
+    if ((args.hasArg(clang::driver::options::OPT_fopenmp_target_debug) ||
+         args.hasArg(clang::driver::options::OPT_fopenmp_target_debug_EQ))) {
+      res.getLangOpts().OpenMPTargetDebug = getLastArgIntValue(
+          args, clang::driver::options::OPT_fopenmp_target_debug_EQ,
+          res.getLangOpts().OpenMPTargetDebug, diags);
 
       if (!res.getLangOpts().OpenMPTargetDebug &&
-          args.hasArg(clang::options::OPT_fopenmp_target_debug))
+          args.hasArg(clang::driver::options::OPT_fopenmp_target_debug))
         res.getLangOpts().OpenMPTargetDebug = 1;
     }
-    if (args.hasArg(clang::options::OPT_no_offloadlib))
+    if (args.hasArg(clang::driver::options::OPT_no_offloadlib))
       res.getLangOpts().NoGPULib = 1;
   }
   if (llvm::Triple(res.getTargetOpts().triple).isGPU()) {
@@ -1304,7 +1336,8 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   }
 
   // Get the OpenMP target triples if any.
-  if (auto *arg = args.getLastArg(clang::options::OPT_offload_targets_EQ)) {
+  if (auto *arg =
+          args.getLastArg(clang::driver::options::OPT_offload_targets_EQ)) {
     enum ArchPtrSize { Arch16Bit, Arch32Bit, Arch64Bit };
     auto getArchPtrSize = [](const llvm::Triple &triple) {
       if (triple.isArch16Bit())
@@ -1352,7 +1385,7 @@ static bool parseIntegerOverflowArgs(CompilerInvocation &invoc,
                                      clang::DiagnosticsEngine &diags) {
   Fortran::common::LangOptions &opts = invoc.getLangOpts();
 
-  if (args.getLastArg(clang::options::OPT_fwrapv))
+  if (args.getLastArg(clang::driver::options::OPT_fwrapv))
     opts.setSignedOverflowBehavior(Fortran::common::LangOptions::SOB_Defined);
 
   return true;
@@ -1371,7 +1404,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
   Fortran::common::LangOptions &opts = invoc.getLangOpts();
 
   if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::options::OPT_ffp_contract)) {
+          args.getLastArg(clang::driver::options::OPT_ffp_contract)) {
     const llvm::StringRef val = a->getValue();
     enum Fortran::common::LangOptions::FPModeKind fpContractMode;
 
@@ -1388,31 +1421,31 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
     opts.setFPContractMode(fpContractMode);
   }
 
-  if (args.getLastArg(clang::options::OPT_menable_no_infs)) {
+  if (args.getLastArg(clang::driver::options::OPT_menable_no_infs)) {
     opts.NoHonorInfs = true;
   }
 
-  if (args.getLastArg(clang::options::OPT_menable_no_nans)) {
+  if (args.getLastArg(clang::driver::options::OPT_menable_no_nans)) {
     opts.NoHonorNaNs = true;
   }
 
-  if (args.getLastArg(clang::options::OPT_fapprox_func)) {
+  if (args.getLastArg(clang::driver::options::OPT_fapprox_func)) {
     opts.ApproxFunc = true;
   }
 
-  if (args.getLastArg(clang::options::OPT_fno_signed_zeros)) {
+  if (args.getLastArg(clang::driver::options::OPT_fno_signed_zeros)) {
     opts.NoSignedZeros = true;
   }
 
-  if (args.getLastArg(clang::options::OPT_mreassociate)) {
+  if (args.getLastArg(clang::driver::options::OPT_mreassociate)) {
     opts.AssociativeMath = true;
   }
 
-  if (args.getLastArg(clang::options::OPT_freciprocal_math)) {
+  if (args.getLastArg(clang::driver::options::OPT_freciprocal_math)) {
     opts.ReciprocalMath = true;
   }
 
-  if (args.getLastArg(clang::options::OPT_ffast_math)) {
+  if (args.getLastArg(clang::driver::options::OPT_ffast_math)) {
     opts.NoHonorInfs = true;
     opts.NoHonorNaNs = true;
     opts.AssociativeMath = true;
@@ -1422,7 +1455,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
     opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast);
   }
 
-  if (args.hasArg(clang::options::OPT_fno_fast_real_mod))
+  if (args.hasArg(clang::driver::options::OPT_fno_fast_real_mod))
     opts.NoFastRealMod = true;
 
   return true;
@@ -1437,8 +1470,10 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc,
 /// \param [out] diags DiagnosticsEngine to report erros with
 static bool parseVScaleArgs(CompilerInvocation &invoc, llvm::opt::ArgList &args,
                             clang::DiagnosticsEngine &diags) {
-  const auto *vscaleMin = args.getLastArg(clang::options::OPT_mvscale_min_EQ);
-  const auto *vscaleMax = args.getLastArg(clang::options::OPT_mvscale_max_EQ);
+  const auto *vscaleMin =
+      args.getLastArg(clang::driver::options::OPT_mvscale_min_EQ);
+  const auto *vscaleMax =
+      args.getLastArg(clang::driver::options::OPT_mvscale_max_EQ);
 
   if (!vscaleMin && !vscaleMax)
     return true;
@@ -1486,7 +1521,8 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc,
 
   // TODO: support --dependent-lib on other platforms when MLIR supports
   //       !llvm.dependent.lib
-  if (args.hasArg(clang::options::OPT_dependent_lib) && !triple.isOSWindows()) {
+  if (args.hasArg(clang::driver::options::OPT_dependent_lib) &&
+      !triple.isOSWindows()) {
     const unsigned diagID =
         diags.getCustomDiagID(clang::DiagnosticsEngine::Error,
                               "--dependent-lib is only supported on Windows");
@@ -1494,10 +1530,12 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc,
     return false;
   }
 
-  opts.DependentLibs = args.getAllArgValues(clang::options::OPT_dependent_lib);
+  opts.DependentLibs =
+      args.getAllArgValues(clang::driver::options::OPT_dependent_lib);
 
   // -flto=full/thin option.
-  if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_flto_EQ)) {
+  if (const llvm::opt::Arg *a =
+          args.getLastArg(clang::driver::options::OPT_flto_EQ)) {
     llvm::StringRef s = a->getValue();
     assert((s == "full" || s == "thin") && "Unknown LTO mode.");
     if (s == "full")
@@ -1508,10 +1546,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc,
 
   // -ffat-lto-objects
   if (const llvm::opt::Arg *arg =
-          args.getLastArg(clang::options::OPT_ffat_lto_objects,
-                          clang::options::OPT_fno_fat_lto_objects)) {
+          args.getLastArg(clang::driver::options::OPT_ffat_lto_objects,
+                          clang::driver::options::OPT_fno_fat_lto_objects)) {
     opts.PrepareForFatLTO =
-        arg->getOption().matches(clang::options::OPT_ffat_lto_objects);
+        arg->getOption().matches(clang::driver::options::OPT_ffat_lto_objects);
     if (opts.PrepareForFatLTO) {
       assert((opts.PrepareForFullLTO || opts.PrepareForThinLTO) &&
              "Unknown LTO mode");
@@ -1552,8 +1590,8 @@ bool CompilerInvocation::createFromArgs(
       llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple());
 
   // Parse the arguments
-  const llvm::opt::OptTable &opts = clang::getDriverOptTable();
-  llvm::opt::Visibility visibilityMask(clang::options::FC1Option);
+  const llvm::opt::OptTable &opts = clang::driver::getDriverOptTable();
+  llvm::opt::Visibility visibilityMask(clang::driver::options::FC1Option);
   unsigned missingArgIndex, missingArgCount;
   llvm::opt::InputArgList args = opts.ParseArgs(
       commandLineArgs, missingArgIndex, missingArgCount, visibilityMask);
@@ -1566,7 +1604,7 @@ bool CompilerInvocation::createFromArgs(
   }
 
   // Issue errors on unknown arguments
-  for (const auto *a : args.filtered(clang::options::OPT_UNKNOWN)) {
+  for (const auto *a : args.filtered(clang::driver::options::OPT_UNKNOWN)) {
     auto argString = a->getAsString(args);
     std::string nearest;
     if (opts.findNearest(argString, nearest, visibilityMask) > 1)
@@ -1578,15 +1616,15 @@ bool CompilerInvocation::createFromArgs(
   }
 
   // -flang-experimental-hlfir
-  if (args.hasArg(clang::options::OPT_flang_experimental_hlfir) ||
-      args.hasArg(clang::options::OPT_emit_hlfir)) {
+  if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir) ||
+      args.hasArg(clang::driver::options::OPT_emit_hlfir)) {
     invoc.loweringOpts.setLowerToHighLevelFIR(true);
   }
 
   // -flang-deprecated-no-hlfir
-  if (args.hasArg(clang::options::OPT_flang_deprecated_no_hlfir) &&
-      !args.hasArg(clang::options::OPT_emit_hlfir)) {
-    if (args.hasArg(clang::options::OPT_flang_experimental_hlfir)) {
+  if (args.hasArg(clang::driver::options::OPT_flang_deprecated_no_hlfir) &&
+      !args.hasArg(clang::driver::options::OPT_emit_hlfir)) {
+    if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir)) {
       const unsigned diagID = diags.getCustomDiagID(
           clang::DiagnosticsEngine::Error,
           "Options '-flang-experimental-hlfir' and "
@@ -1597,13 +1635,13 @@ bool CompilerInvocation::createFromArgs(
   }
 
   // -fno-ppc-native-vector-element-order
-  if (args.hasArg(clang::options::OPT_fno_ppc_native_vec_elem_order)) {
+  if (args.hasArg(clang::driver::options::OPT_fno_ppc_native_vec_elem_order)) {
     invoc.loweringOpts.setNoPPCNativeVecElemOrder(true);
   }
 
   // -f[no-]init-global-zero
-  if (args.hasFlag(clang::options::OPT_finit_global_zero,
-                   clang::options::OPT_fno_init_global_zero,
+  if (args.hasFlag(clang::driver::options::OPT_finit_global_zero,
+                   clang::driver::options::OPT_fno_init_global_zero,
                    /*default=*/true))
     invoc.loweringOpts.setInitGlobalZero(true);
   else
@@ -1612,8 +1650,8 @@ bool CompilerInvocation::createFromArgs(
   // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or
   // -Rpass-analysis. This will be used later when processing and outputting the
   // remarks generated by LLVM in ExecuteCompilerInvocation.cpp.
-  for (auto *a : args.filtered(clang::options::OPT_R_Group)) {
-    if (a->getOption().matches(clang::options::OPT_R_value_Group))
+  for (auto *a : args.filtered(clang::driver::options::OPT_R_Group)) {
+    if (a->getOption().matches(clang::driver::options::OPT_R_value_Group))
       // This is -Rfoo=, where foo is the name of the diagnostic
       // group. Add only the remark option name to the diagnostics. e.g. for
       // -Rpass= we will add the string "pass".
@@ -1626,19 +1664,20 @@ bool CompilerInvocation::createFromArgs(
   }
 
   // -frealloc-lhs is the default.
-  if (!args.hasFlag(clang::options::OPT_frealloc_lhs,
-                    clang::options::OPT_fno_realloc_lhs, true))
+  if (!args.hasFlag(clang::driver::options::OPT_frealloc_lhs,
+                    clang::driver::options::OPT_fno_realloc_lhs, true))
     invoc.loweringOpts.setReallocateLHS(false);
 
-  invoc.loweringOpts.setRepackArrays(args.hasFlag(
-      clang::options::OPT_frepack_arrays, clang::options::OPT_fno_repack_arrays,
-      /*default=*/false));
+  invoc.loweringOpts.setRepackArrays(
+      args.hasFlag(clang::driver::options::OPT_frepack_arrays,
+                   clang::driver::options::OPT_fno_repack_arrays,
+                   /*default=*/false));
   invoc.loweringOpts.setStackRepackArrays(
-      args.hasFlag(clang::options::OPT_fstack_repack_arrays,
-                   clang::options::OPT_fno_stack_repack_arrays,
+      args.hasFlag(clang::driver::options::OPT_fstack_repack_arrays,
+                   clang::driver::options::OPT_fno_stack_repack_arrays,
                    /*default=*/false));
-  if (auto *arg =
-          args.getLastArg(clang::options::OPT_frepack_arrays_contiguity_EQ))
+  if (auto *arg = args.getLastArg(
+          clang::driver::options::OPT_frepack_arrays_contiguity_EQ))
     invoc.loweringOpts.setRepackArraysWhole(arg->getValue() ==
                                             llvm::StringRef{"whole"});
 
@@ -1658,8 +1697,10 @@ bool CompilerInvocation::createFromArgs(
   // `mlirArgs`. Instead, you can use
   //    * `-mllvm <your-llvm-option>`, or
   //    * `-mmlir <your-mlir-option>`.
-  invoc.frontendOpts.llvmArgs = args.getAllArgValues(clang::options::OPT_mllvm);
-  invoc.frontendOpts.mlirArgs = args.getAllArgValues(clang::options::OPT_mmlir);
+  invoc.frontendOpts.llvmArgs =
+      args.getAllArgValues(clang::driver::options::OPT_mllvm);
+  invoc.frontendOpts.mlirArgs =
+      args.getAllArgValues(clang::driver::options::OPT_mmlir);
 
   success &= parseLangOptionsArgs(invoc, args, diags);
 
@@ -1683,7 +1724,7 @@ bool CompilerInvocation::createFromArgs(
   }
 
   // Process the timing-related options.
-  if (args.hasArg(clang::options::OPT_ftime_report))
+  if (args.hasArg(clang::driver::options::OPT_ftime_report))
     invoc.enableTimers = true;
 
   invoc.setArgv0(argv0);
diff --git a/flang/lib/FrontendTool/CMakeLists.txt b/flang/lib/FrontendTool/CMakeLists.txt
index b69436c36d438..faf56e9d955a1 100644
--- a/flang/lib/FrontendTool/CMakeLists.txt
+++ b/flang/lib/FrontendTool/CMakeLists.txt
@@ -18,6 +18,5 @@ add_flang_library(flangFrontendTool
 
   CLANG_LIBS
   clangBasic
-  clangOptions
   clangDriver
 )
diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
index 7586be59ba01b..09ac129d3e689 100644
--- a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
+++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
@@ -23,7 +23,7 @@
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Pass/PassManager.h"
 #include "clang/Basic/DiagnosticFrontend.h"
-#include "clang/Options/Options.h"
+#include "clang/Driver/Options.h"
 #include "llvm/Option/OptTable.h"
 #include "llvm/Option/Option.h"
 #include "llvm/Support/BuryPointer.h"
@@ -153,10 +153,10 @@ updateDiagEngineForOptRemarks(clang::DiagnosticsEngine &diagsEng,
 bool executeCompilerInvocation(CompilerInstance *flang) {
   // Honor -help.
   if (flang->getFrontendOpts().showHelp) {
-    clang::getDriverOptTable().printHelp(
+    clang::driver::getDriverOptTable().printHelp(
         llvm::outs(), "flang -fc1 [options] file...", "LLVM 'Flang' Compiler",
         /*ShowHidden=*/false, /*ShowAllAliases=*/false,
-        llvm::opt::Visibility(clang::options::FC1Option));
+        llvm::opt::Visibility(clang::driver::options::FC1Option));
     return true;
   }
 
diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt
index 3caec17b3edbb..801fc324e888d 100644
--- a/flang/tools/flang-driver/CMakeLists.txt
+++ b/flang/tools/flang-driver/CMakeLists.txt
@@ -26,7 +26,6 @@ target_link_libraries(flang
 clang_target_link_libraries(flang
   PRIVATE
   clangDriver
-  clangOptions
   clangBasic
 )
 
diff --git a/flang/tools/flang-driver/driver.cpp b/flang/tools/flang-driver/driver.cpp
index 0840255a739f3..bd878b7a642f1 100644
--- a/flang/tools/flang-driver/driver.cpp
+++ b/flang/tools/flang-driver/driver.cpp
@@ -52,9 +52,9 @@ createAndPopulateDiagOpts(llvm::ArrayRef<const char *> argv) {
   // Any errors that would be diagnosed here will also be diagnosed later,
   // when the DiagnosticsEngine actually exists.
   unsigned missingArgIndex, missingArgCount;
-  llvm::opt::InputArgList args = clang::getDriverOptTable().ParseArgs(
+  llvm::opt::InputArgList args = clang::driver::getDriverOptTable().ParseArgs(
       argv.slice(1), missingArgIndex, missingArgCount,
-      llvm::opt::Visibility(clang::options::FlangOption));
+      llvm::opt::Visibility(clang::driver::options::FlangOption));
 
   (void)Fortran::frontend::parseDiagnosticArgs(*diagOpts, args);
 
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
index bfbd85ea34203..8b4a3e0a7c3fb 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
@@ -56,7 +56,7 @@ using namespace lldb;
 using namespace lldb_private;
 
 #define OPTTABLE_STR_TABLE_CODE
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef OPTTABLE_STR_TABLE_CODE
 
 static Status ExceptionMaskValidator(const char *string, void *unused) {
@@ -1124,7 +1124,7 @@ void PlatformDarwin::AddClangModuleCompilationOptionsForSDKType(
 #define OPTION(PREFIX_OFFSET, NAME_OFFSET, VAR, ...)                           \
   llvm::StringRef opt_##VAR = OptionStrTable[NAME_OFFSET];                     \
   (void)opt_##VAR;
-#include "clang/Options/Options.inc"
+#include "clang/Driver/Options.inc"
 #undef OPTION
     minimum_version_option << '-';
     switch (sdk_type) {
diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index 1716492e73ff9..3b3c26fee02ec 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -406,7 +406,7 @@ class LLVM_ABI MCAsmInfo {
   // Generated object files can use all ELF features supported by GNU ld of
   // this binutils version and later. INT_MAX means all features can be used,
   // regardless of GNU ld support. The default value is referenced by
-  // clang/Options/Options.td.
+  // clang/Driver/Options.td.
   std::pair<int, int> BinutilsVersion = {2, 26};
 
   /// Should we use the integrated assembler?
diff --git a/llvm/include/llvm/Option/Arg.h b/llvm/include/llvm/Option/Arg.h
index 496373d28600f..b1e56b58da684 100644
--- a/llvm/include/llvm/Option/Arg.h
+++ b/llvm/include/llvm/Option/Arg.h
@@ -51,7 +51,7 @@ class Arg {
   /// Was this argument used to affect compilation?
   ///
   /// This is used to generate an "argument unused" warning (without
-  /// clang::options::TargetSpecific) or "unsupported option" error
+  /// clang::driver::options::TargetSpecific) or "unsupported option" error
   /// (with TargetSpecific).
   mutable unsigned Claimed : 1;
 
diff --git a/revert_patches.txt b/revert_patches.txt
index b88b846f64b68..6be344f862546 100644
--- a/revert_patches.txt
+++ b/revert_patches.txt
@@ -8,3 +8,6 @@ breaks build of ROCmValidationSuite
 breaks fortran declare-target-link1
 [OMPIRBuilder] Fix addrspace of internal critical section lock (#166459
 ---
+complicated to land parallel-EQ
+Reland "[clang] Refactor option-related code from clangDriver into new clangOptions library" (#167374)
+---
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 36b71dd49ef13..4d279bffd3723 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1501,9 +1501,9 @@ cc_library(
 
 gentbl_cc_library(
     name = "driver_options_inc_gen",
-    tbl_outs = {"include/clang/Options/Options.inc": ["-gen-opt-parser-defs"]},
+    tbl_outs = {"include/clang/Driver/Options.inc": ["-gen-opt-parser-defs"]},
     tblgen = "//llvm:llvm-tblgen",
-    td_file = "include/clang/Options/Options.td",
+    td_file = "include/clang/Driver/Options.td",
     deps = ["//llvm:OptParserTdFiles"],
 )
 

From 7a73e69ac85faf2e508037e6573fe7438025c4ab Mon Sep 17 00:00:00 2001
From: Gleb Popov <6yearold@gmail.com>
Date: Tue, 11 Nov 2025 14:23:38 +0300
Subject: [PATCH 93/97] [flang-rt] Use dlsym to access char** environ on
 FreeBSD (#158477)

---
 flang-rt/lib/runtime/environment.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp
index 97ac56236e799..2a2e19f9f17ec 100644
--- a/flang-rt/lib/runtime/environment.cpp
+++ b/flang-rt/lib/runtime/environment.cpp
@@ -17,6 +17,10 @@
 
 #ifdef _WIN32
 extern char **_environ;
+#elif defined(__FreeBSD__)
+// FreeBSD has environ in crt rather than libc. Using "extern char** environ"
+// in the code of a shared library makes it fail to link with -Wl,--no-undefined
+// See https://reviews.freebsd.org/D30842#840642
 #else
 extern char **environ;
 #endif
@@ -104,6 +108,11 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
 
 #ifdef _WIN32
   envp = _environ;
+#elif defined(__FreeBSD__)
+  auto envpp{reinterpret_cast<char ***>(dlsym(RTLD_DEFAULT, "environ"))};
+  if (envpp) {
+    envp = *envpp;
+  }
 #else
   envp = environ;
 #endif

From de3de3f14399074dbddf1b396f8cd080bb276057 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin@arm.com>
Date: Tue, 11 Nov 2025 11:46:59 +0000
Subject: [PATCH 94/97] [LV] Consider interleaving when
 -enable-wide-lane-mask=true (#163387)

Currently the only way to enable the use of wide active lane masks is to pass
-enable-wide-lane-mask and force both interleaving & tail-folding with additional
flags. This patch changes selectInterleaveCount to consider interleaving if wide
lane masks were requested, although the feature remains off by default.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  29 ++++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   4 -
 .../Transforms/Vectorize/VPlanTransforms.h    |   1 +
 .../AArch64/sve-wide-lane-mask.ll             | 104 ++++++++++++++++--
 4 files changed, 121 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 19823263a23bf..345bc63081b81 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -249,6 +249,11 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
                    "Use predicated EVL instructions for tail folding. If EVL "
                    "is unsupported, fallback to data-without-lane-mask.")));
 
+cl::opt<bool> llvm::EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide lane masks when used for control flow in "
+             "tail-folded loops"));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1314,6 +1319,12 @@ class LoopVectorizationCostModel {
     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
   }
 
+  /// Returns true if tail-folding is preferred over a scalar epilogue.
+  bool preferPredicatedLoop() const {
+    return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
+           ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
+  }
+
   /// Returns the TailFoldingStyle that is best for the current loop.
   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
     if (!ChosenTailFoldingStyle)
@@ -1374,6 +1385,17 @@ class LoopVectorizationCostModel {
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
+  /// Returns true if the use of wide lane masks is requested and the loop is
+  /// using tail-folding with a lane mask for control flow.
+  bool useWideActiveLaneMask() const {
+    if (!EnableWideActiveLaneMask)
+      return false;
+
+    TailFoldingStyle TF = getTailFoldingStyle();
+    return TF == TailFoldingStyle::DataAndControlFlow ||
+           TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+  }
+
   /// Return maximum safe number of elements to be processed per vector
   /// iteration, which do not prevent store-load forwarding and are safe with
   /// regard to the memory dependencies. Required for EVL-based VPlans to
@@ -4560,7 +4582,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   // 3. We don't interleave if we think that we will spill registers to memory
   // due to the increased register pressure.
 
-  if (!CM.isScalarEpilogueAllowed())
+  // Only interleave tail-folded loops if wide lane masks are requested, as the
+  // overhead of multiple instructions to calculate the predicate is likely
+  // not beneficial. If a scalar epilogue is not allowed for any other reason,
+  // do not interleave.
+  if (!CM.isScalarEpilogueAllowed() &&
+      !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask()))
     return 1;
 
   if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7eee51e9f2bbf..10afd006c90c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,10 +40,6 @@
 using namespace llvm;
 using namespace VPlanPatternMatch;
 
-static cl::opt<bool> EnableWideActiveLaneMask(
-    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
-    cl::desc("Enable use of wide get active lane mask instructions"));
-
 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
     VPlan &Plan,
     function_ref<const InductionDescriptor *(PHINode *)>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b28559b620e13..34850743e7b62 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -32,6 +32,7 @@ class VPRecipeBuilder;
 struct VFRange;
 
 extern cl::opt<bool> VerifyEachVPlan;
+extern cl::opt<bool> EnableWideActiveLaneMask;
 
 struct VPlanTransforms {
   /// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
index f2e3b708d7820..61da142ad376c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
@@ -1,6 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^middle.block:" --version 4
 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=1 < %s | FileCheck %s -check-prefix CHECK-UF1
 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4
+; RUN: opt -S --passes=loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck %s -check-prefix CHECK-TF
+; RUN: opt -S --passes=forceattrs,loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-attribute=optsize < %s | FileCheck %s -check-prefix CHECK-UF1
 
 target triple = "aarch64-unknown-linux"
 
@@ -101,6 +103,49 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src,
 ; CHECK-UF4-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-UF4:       middle.block:
 ;
+; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask(
+; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-TF-NEXT:  entry:
+; CHECK-TF-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK-TF:       vector.ph:
+; CHECK-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
+; CHECK-TF-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5
+; CHECK-TF-NEXT:    [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16)
+; CHECK-TF-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-TF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-TF:       vector.body:
+; CHECK-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 4
+; CHECK-TF-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]]
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP9]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT:    [[TMP13:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
+; CHECK-TF-NEXT:    [[TMP14:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3)
+; CHECK-TF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 4
+; CHECK-TF-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[TMP17]]
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP15]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP14]], ptr align 1 [[TMP18]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP6]])
+; CHECK-TF-NEXT:    [[TMP19]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16)
+; CHECK-TF-NEXT:    [[TMP20]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-TF-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[TMP20]], i32 0
+; CHECK-TF-NEXT:    [[TMP22:%.*]] = xor i1 [[TMP21]], true
+; CHECK-TF-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-TF:       middle.block:
+;
 entry:
   br label %for.body
 
@@ -222,6 +267,52 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl
 ; CHECK-UF4-NEXT:    br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-UF4:       middle.block:
 ;
+; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask_double(
+; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-TF-NEXT:  entry:
+; CHECK-TF-NEXT:    [[CMP6:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-TF-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK-TF:       for.body.preheader:
+; CHECK-TF-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK-TF:       vector.ph:
+; CHECK-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-TF-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; CHECK-TF-NEXT:    [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-TF-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-TF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-TF:       vector.body:
+; CHECK-TF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 2 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
+; CHECK-TF-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP9]], i64 [[TMP11]]
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-TF-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 2 x double> poison)
+; CHECK-TF-NEXT:    [[TMP13:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
+; CHECK-TF-NEXT:    [[TMP14:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD2]], splat (double 3.000000e+00)
+; CHECK-TF-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-TF-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT:    [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1
+; CHECK-TF-NEXT:    [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP17]]
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP13]], ptr align 8 [[TMP15]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-TF-NEXT:    call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr align 8 [[TMP18]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
+; CHECK-TF-NEXT:    [[TMP19]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-TF-NEXT:    [[TMP20]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-TF-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP20]], i32 0
+; CHECK-TF-NEXT:    [[TMP22:%.*]] = xor i1 [[TMP21]], true
+; CHECK-TF-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-TF:       middle.block:
+;
 entry:
   %cmp6 = icmp sgt i64 %n, 0
   br i1 %cmp6, label %for.body, label %for.end
@@ -243,14 +334,3 @@ for.end:
 
 attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" }
 
-;.
-; CHECK-UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-;.
-; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-;.

From 6e5f2775857ea25d44879e339dd58a018f5fdc02 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 11 Nov 2025 11:52:39 +0000
Subject: [PATCH 95/97] [X86] bittest-big-integer.ll - add test showing missed
 RMW fold because the load is hidden behind a bitcast (#167491)

---
 llvm/test/CodeGen/X86/bittest-big-integer.ll | 202 ++++++++++++++++++-
 1 file changed, 194 insertions(+), 8 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 9d31c298bfb9e..e9e9ee9c97593 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686--                   | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=X64,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE,SSE4
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
 
@@ -956,6 +956,192 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
   ret i1 %cmp
 }
 
+; Load hidden behind bitcast
+define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind {
+; X86-LABEL: complement_ne_i128_bitcast:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movzbl 16(%ebp), %ecx
+; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl 56(%esp,%eax), %esi
+; X86-NEXT:    movl 60(%esp,%eax), %ebx
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %esi, %ebx
+; X86-NEXT:    movzwl 14(%edx), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll $16, %edi
+; X86-NEXT:    movzwl 12(%edx), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    xorl %ebx, %edi
+; X86-NEXT:    movl 52(%esp,%eax), %edx
+; X86-NEXT:    movzbl 16(%ebp), %ecx
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movzwl 10(%eax), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll $16, %ebx
+; X86-NEXT:    movzwl 8(%eax), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl 48(%esp,%eax), %esi
+; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movzwl 6(%ecx), %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    movzwl 4(%ecx), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    movzbl 16(%ebp), %ecx
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movzwl 2(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll $16, %edx
+; X86-NEXT:    movzwl (%ecx), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %edi, 12(%ecx)
+; X86-NEXT:    movl %ebx, 8(%ecx)
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movw %dx, 14(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movw %dx, 12(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movw %dx, 10(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movw %dx, 8(%eax)
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    movw %dx, 6(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movw %dx, 4(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movw %cx, 2(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; SSE2-LABEL: complement_ne_i128_bitcast:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movl %esi, %ecx
+; SSE2-NEXT:    movl $1, %eax
+; SSE2-NEXT:    xorl %edx, %edx
+; SSE2-NEXT:    shldq %cl, %rax, %rdx
+; SSE2-NEXT:    xorl %esi, %esi
+; SSE2-NEXT:    shlq %cl, %rax
+; SSE2-NEXT:    testb $64, %cl
+; SSE2-NEXT:    cmovneq %rax, %rdx
+; SSE2-NEXT:    cmovneq %rsi, %rax
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    xorq %rdx, 8(%rdi)
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    xorq %rax, %rcx
+; SSE2-NEXT:    movq %rcx, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: complement_ne_i128_bitcast:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movl %esi, %ecx
+; SSE4-NEXT:    movl $1, %eax
+; SSE4-NEXT:    xorl %edx, %edx
+; SSE4-NEXT:    shldq %cl, %rax, %rdx
+; SSE4-NEXT:    shlq %cl, %rax
+; SSE4-NEXT:    xorl %esi, %esi
+; SSE4-NEXT:    testb $64, %cl
+; SSE4-NEXT:    cmovneq %rax, %rdx
+; SSE4-NEXT:    cmovneq %rsi, %rax
+; SSE4-NEXT:    movdqa (%rdi), %xmm0
+; SSE4-NEXT:    movq %xmm0, %rcx
+; SSE4-NEXT:    xorq %rax, %rcx
+; SSE4-NEXT:    pextrq $1, %xmm0, %rax
+; SSE4-NEXT:    xorq %rdx, %rax
+; SSE4-NEXT:    movq %rax, 8(%rdi)
+; SSE4-NEXT:    movq %rcx, (%rdi)
+; SSE4-NEXT:    retq
+;
+; AVX2-LABEL: complement_ne_i128_bitcast:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movl %esi, %ecx
+; AVX2-NEXT:    movl $1, %eax
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    shldq %cl, %rax, %rdx
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    shlxq %rcx, %rax, %rax
+; AVX2-NEXT:    testb $64, %cl
+; AVX2-NEXT:    cmovneq %rax, %rdx
+; AVX2-NEXT:    cmovneq %rsi, %rax
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT:    xorq %rax, %rcx
+; AVX2-NEXT:    xorq %rdx, %rsi
+; AVX2-NEXT:    movq %rsi, 8(%rdi)
+; AVX2-NEXT:    movq %rcx, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: complement_ne_i128_bitcast:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movl %esi, %ecx
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movl $1, %edx
+; AVX512-NEXT:    xorl %esi, %esi
+; AVX512-NEXT:    shldq %cl, %rdx, %rsi
+; AVX512-NEXT:    shlxq %rcx, %rdx, %rdx
+; AVX512-NEXT:    testb $64, %cl
+; AVX512-NEXT:    cmovneq %rdx, %rsi
+; AVX512-NEXT:    cmovneq %rax, %rdx
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    xorq %rdx, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT:    xorq %rsi, %rcx
+; AVX512-NEXT:    movq %rcx, 8(%rdi)
+; AVX512-NEXT:    movq %rax, (%rdi)
+; AVX512-NEXT:    retq
+  %rem = and i32 %position, 127
+  %ofs = zext nneg i32 %rem to i128
+  %bit = shl nuw i128 1, %ofs
+  %ldv = load <8 x i16>, ptr %word
+  %ld = bitcast <8 x i16> %ldv to i128
+  %test = and i128 %ld, %bit
+  %res = xor i128 %ld, %bit
+  store i128 %res, ptr %word
+  ret <8 x i16> %ldv
+}
+
 ; Multiple loads in store chain
 define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
 ; X86-LABEL: reset_multiload_i128:
@@ -975,10 +1161,10 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
 ; X86-NEXT:    btrl %edx, %ebx
 ; X86-NEXT:    btl %edx, %edi
 ; X86-NEXT:    movl %ebx, (%ecx,%esi)
-; X86-NEXT:    jae .LBB22_2
+; X86-NEXT:    jae .LBB23_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:  .LBB22_2:
+; X86-NEXT:  .LBB23_2:
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -994,10 +1180,10 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
 ; X64-NEXT:    btrl %esi, %r8d
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    btl %esi, %r9d
-; X64-NEXT:    jb .LBB22_2
+; X64-NEXT:    jb .LBB23_2
 ; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    movl (%rdx), %eax
-; X64-NEXT:  .LBB22_2:
+; X64-NEXT:  .LBB23_2:
 ; X64-NEXT:    movl %r8d, (%rdi,%rcx)
 ; X64-NEXT:    retq
   %rem = and i32 %position, 127
@@ -1046,10 +1232,10 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind
 ; X86-NEXT:    movl %edi, (%edx)
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:    jne .LBB23_2
+; X86-NEXT:    jne .LBB24_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:  .LBB23_2:
+; X86-NEXT:  .LBB24_2:
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

From b440fb758477e758a640842e0de9baac8616d822 Mon Sep 17 00:00:00 2001
From: Weibo He <NewSigma@163.com>
Date: Tue, 11 Nov 2025 19:56:17 +0800
Subject: [PATCH 96/97] [Clang][CUDA] Replace inline asm in __trap() with
 intrinsic (#166152)

This patch proposes replacing the inline assembly with an intrinsic and
adds the `noreturn` to avoid warnings when the caller is also
`noreturn`.
---
 clang/lib/Headers/__clang_cuda_device_functions.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h
index 86123727a1bc3..0226fe95abab6 100644
--- a/clang/lib/Headers/__clang_cuda_device_functions.h
+++ b/clang/lib/Headers/__clang_cuda_device_functions.h
@@ -528,7 +528,7 @@ __DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); }
 __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
 __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
 __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
-__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
+__DEVICE__ __attribute__((noreturn)) void __trap(void) { __builtin_trap(); }
 __DEVICE__ unsigned short
 __usAtomicCAS(unsigned short *__p, unsigned short __cmp, unsigned short __v) {
   return __nvvm_atom_cas_gen_us(__p, __cmp, __v);

From 300750d4bea3fc2a17de13aa26f71aa10f2f5d2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tuomas=20K=C3=A4rn=C3=A4?= <tuomas.karna@intel.com>
Date: Tue, 11 Nov 2025 13:57:54 +0200
Subject: [PATCH 97/97] [MLIR][XeGPU][TransformOps] Add set_gpu_launch_threads
 op (#166865)

Adds `transform.xegpu.set_gpu_launch_threads` that overrides `gpu.launch` operation threads.
---
 .../XeGPU/TransformOps/XeGPUTransformOps.td   | 39 +++++++++++
 .../XeGPU/TransformOps/XeGPUTransformOps.cpp  | 64 +++++++++++++++++++
 mlir/python/mlir/dialects/transform/xegpu.py  | 36 +++++++++++
 .../Dialect/XeGPU/transform-ops-invalid.mlir  | 53 +++++++++++++++
 mlir/test/Dialect/XeGPU/transform-ops.mlir    | 56 ++++++++++++++++
 .../python/dialects/transform_xegpu_ext.py    | 15 +++++
 6 files changed, 263 insertions(+)

diff --git a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td
index 34f333e556deb..f5e4afad535e5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td
@@ -161,4 +161,43 @@ def SetOpLayoutAttrOp : Op<Transform_Dialect, "xegpu.set_op_layout_attr", [
   }];
 }
 
+def SetGPULaunchThreadsOp
+    : Op<Transform_Dialect, "xegpu.set_gpu_launch_threads", [
+      DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+      TransformOpInterface
+    ]> {
+
+  let summary = "Set number of threads for a given gpu.launch operation";
+  let description = [{
+    Overrides the x,y,z threads operands of a given `gpu.launch` operation in-place.
+  }];
+
+  let arguments = (ins TransformHandleTypeInterface:$target,
+                   Variadic<TransformAnyParamTypeOrAnyHandle>:$threads,
+                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_threads
+                   );
+  let results = (outs);
+  let builders = [
+    OpBuilder<(ins "Value":$target, "ArrayRef<OpFoldResult>":$mixedThreads)>,
+  ];
+
+  let assemblyFormat = [{
+    $target
+    `threads` `=` custom<DynamicIndexList>($threads, $static_threads)
+    attr-dict `:` qualified(type(operands))
+  }];
+
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure apply(
+        ::mlir::transform::TransformRewriter &rewriter,
+        ::mlir::transform::TransformResults &transformResults,
+        ::mlir::transform::TransformState &state);
+
+    ::llvm::SmallVector<::mlir::OpFoldResult> getMixedThreads() {
+      Builder b(getContext());
+      return getMixedValues(getStaticThreads(), getThreads(), b);
+    }
+  }];
+}
+
 #endif // XEGPU_TRANSFORM_OPS
diff --git a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
index 5fdd8534e4e51..7a7a8c9066f09 100644
--- a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
@@ -341,6 +342,69 @@ void transform::SetOpLayoutAttrOp::getEffects(
   modifiesPayload(effects);
 }
 
+void transform::SetGPULaunchThreadsOp::build(
+    OpBuilder &builder, OperationState &ostate, Value target,
+    ArrayRef<OpFoldResult> mixedThreads) {
+  SmallVector<int64_t> staticThreads;
+  SmallVector<Value> dynamicThreads;
+  dispatchIndexOpFoldResults(mixedThreads, dynamicThreads, staticThreads);
+  build(builder, ostate, target.getType(),
+        /*target=*/target,
+        /*threads=*/dynamicThreads,
+        /*static_threads=*/staticThreads);
+}
+
+DiagnosedSilenceableFailure
+transform::SetGPULaunchThreadsOp::apply(transform::TransformRewriter &rewriter,
+                                        transform::TransformResults &results,
+                                        transform::TransformState &state) {
+  auto targetOps = state.getPayloadOps(getTarget());
+  if (!llvm::hasSingleElement(targetOps)) {
+    return emitDefiniteFailure() << "Requires exactly one targetOp handle (got "
+                                 << llvm::range_size(targetOps) << ")";
+  }
+  Operation *target = *targetOps.begin();
+
+  auto launchOp = dyn_cast<gpu::LaunchOp>(target);
+  if (!launchOp) {
+    auto diag = emitSilenceableFailure(getLoc())
+                << "Expected a gpu.launch op, but got: " << target->getName();
+    diag.attachNote(target->getLoc()) << "target op";
+    return diag;
+  }
+
+  SmallVector<int32_t> threads;
+  DiagnosedSilenceableFailure status =
+      convertMixedValuesToInt(state, (*this), threads, getMixedThreads());
+  if (!status.succeeded())
+    return status;
+
+  if (threads.size() != 3) {
+    return emitSilenceableFailure(getLoc())
+           << "Expected threads argument to consist of three values (got "
+           << threads.size() << ")";
+  }
+
+  rewriter.setInsertionPoint(launchOp);
+  auto createConstValue = [&](int value) {
+    return arith::ConstantIndexOp::create(rewriter, launchOp.getLoc(), value);
+  };
+
+  // Replace threads in-place.
+  launchOp.getBlockSizeXMutable().assign(createConstValue(threads[0]));
+  launchOp.getBlockSizeYMutable().assign(createConstValue(threads[1]));
+  launchOp.getBlockSizeZMutable().assign(createConstValue(threads[2]));
+
+  return DiagnosedSilenceableFailure::success();
+}
+
+void transform::SetGPULaunchThreadsOp::getEffects(
+    ::llvm::SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  onlyReadsHandle(getTargetMutable(), effects);
+  onlyReadsHandle(getThreadsMutable(), effects);
+  modifiesPayload(effects);
+}
+
 namespace {
 class XeGPUTransformDialectExtension
     : public transform::TransformDialectExtension<
diff --git a/mlir/python/mlir/dialects/transform/xegpu.py b/mlir/python/mlir/dialects/transform/xegpu.py
index ce8015d8f557b..309883cfc4518 100644
--- a/mlir/python/mlir/dialects/transform/xegpu.py
+++ b/mlir/python/mlir/dialects/transform/xegpu.py
@@ -132,3 +132,39 @@ def __init__(
             loc=loc,
             ip=ip,
         )
+
+
+class SetGPULaunchThreadsOp(SetGPULaunchThreadsOp):
+    """Specialization for SetGPULaunchThreadsOp class."""
+
+    def __init__(
+        self,
+        launch_op: Union[Operation, Value],
+        threads: MixedValues,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        (
+            dynamic_threads,
+            static_threads,
+            _,
+        ) = _dispatch_dynamic_index_list(threads)
+
+        super().__init__(
+            _get_op_result_or_value(launch_op),
+            dynamic_threads,
+            static_threads=static_threads,
+            loc=loc,
+            ip=ip,
+        )
+
+
+def set_gpu_launch_threads(
+    launch_op: Union[Operation, Value],
+    threads: MixedValues,
+    *,
+    loc=None,
+    ip=None,
+) -> SetGPULaunchThreadsOp:
+    return SetGPULaunchThreadsOp(launch_op, threads, loc=loc, ip=ip)
diff --git a/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir b/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir
index 726b6748452ae..24f500658f740 100644
--- a/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir
@@ -71,3 +71,56 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+func.func @set_gpu_launch_threads_bad_handle(%arg0: memref<4096x4096xf16>) {
+  %c32 = arith.constant 32 : index // expected-note {{target op}}
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error@below {{Expected a gpu.launch op, but got: arith.constant}}
+    transform.xegpu.set_gpu_launch_threads %0 threads = [8, 4, 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @set_gpu_launch_threads_many_handles(%arg0: memref<4096x4096xf16>) {
+  %c32 = arith.constant 32 : index
+  %c64 = arith.constant 64 : index
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error@below {{Requires exactly one targetOp handle (got 2)}}
+    transform.xegpu.set_gpu_launch_threads %0 threads = [8, 4, 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @set_gpu_launch_threads_bad_threads(%arg0: memref<4096x4096xf16>) {
+  %c1 = arith.constant 1 : index
+  %c16 = arith.constant 16 : index
+  gpu.launch blocks(%arg3, %arg4, %arg5) in (%arg9 = %c16, %arg10 = %c16, %arg11 = %c1) threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) {
+    gpu.terminator
+  }
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["gpu.launch"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error@below {{Expected threads argument to consist of three values (got 2)}}
+    transform.xegpu.set_gpu_launch_threads %0 threads = [8, 4] : !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/XeGPU/transform-ops.mlir b/mlir/test/Dialect/XeGPU/transform-ops.mlir
index bd6a79244ed30..7f2fbe4271a43 100644
--- a/mlir/test/Dialect/XeGPU/transform-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/transform-ops.mlir
@@ -230,6 +230,7 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
 // -----
 
 // CHECK-LABEL: @set_op_layout_attr_operand1
@@ -252,3 +253,58 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+// CHECK-LABEL: @set_gpu_launch_threads
+func.func @set_gpu_launch_threads(%arg0: memref<4096x4096xf16>) {
+  // CHECK: %[[C1:.+]] = arith.constant 1 : index
+  %c1 = arith.constant 1 : index
+  // CHECK: %[[C16:.+]] = arith.constant 16 : index
+  %c16 = arith.constant 16 : index
+  // CHECK: %[[C8:.+]] = arith.constant 8 : index
+  // CHECK: %[[C4:.+]] = arith.constant 4 : index
+  // CHECK: %[[C1_0:.+]] = arith.constant 1 : index
+  // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C16]], %{{.*}} = %[[C16]], %{{.*}} = %[[C1]])
+  // CHECK-SAME: threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C8]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1_0]])
+  gpu.launch blocks(%arg3, %arg4, %arg5) in (%arg9 = %c16, %arg10 = %c16, %arg11 = %c1) threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) {
+    gpu.terminator
+  }
+  return
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["gpu.launch"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // CHECK: transform.xegpu.set_gpu_launch_threads %{{.*}}
+    transform.xegpu.set_gpu_launch_threads %0 threads = [8, 4, 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @set_gpu_launch_threads_param
+func.func @set_gpu_launch_threads_param(%arg0: memref<4096x4096xf16>) {
+  // CHECK: %[[C1:.+]] = arith.constant 1 : index
+  %c1 = arith.constant 1 : index
+  // CHECK: %[[C16:.+]] = arith.constant 16 : index
+  %c16 = arith.constant 16 : index
+  // CHECK: %[[C8:.+]] = arith.constant 8 : index
+  // CHECK: %[[C4:.+]] = arith.constant 4 : index
+  // CHECK: %[[C1_0:.+]] = arith.constant 1 : index
+  // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C16]], %{{.*}} = %[[C16]], %{{.*}} = %[[C1]])
+  // CHECK-SAME: threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C8]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1_0]])
+  gpu.launch blocks(%arg3, %arg4, %arg5) in (%arg9 = %c16, %arg10 = %c16, %arg11 = %c1) threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) {
+    gpu.terminator
+  }
+  return
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["gpu.launch"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // CHECK: transform.xegpu.set_gpu_launch_threads %{{.*}}
+    %th1 = transform.param.constant 4 : i64 -> !transform.param<i64>
+    transform.xegpu.set_gpu_launch_threads %0 threads = [8, %th1, 1] : !transform.any_op, !transform.param<i64>
+    transform.yield
+  }
+}
diff --git a/mlir/test/python/dialects/transform_xegpu_ext.py b/mlir/test/python/dialects/transform_xegpu_ext.py
index 0b587d2020aa6..dc91f5e982579 100644
--- a/mlir/test/python/dialects/transform_xegpu_ext.py
+++ b/mlir/test/python/dialects/transform_xegpu_ext.py
@@ -113,3 +113,18 @@ def setOpLayoutAttrResult():
     # CHECK: sg_layout = [6, 4]
     # CHECK: sg_data = [32, 16]
     # CHECK: inst_data = [8, 16]
+
+
+@run
+def setGPULaunchThreadsOp():
+    sequence = transform.SequenceOp(
+        transform.FailurePropagationMode.Propagate,
+        [],
+        transform.OperationType.get("gpu.launch"),
+    )
+    with InsertionPoint(sequence.body):
+        xegpu.set_gpu_launch_threads(sequence.bodyTarget, threads=[8, 4, 1])
+        transform.YieldOp()
+    # CHECK-LABEL: TEST: setGPULaunchThreadsOp
+    # CHECK: transform.xegpu.set_gpu_launch_threads
+    # CHECK: threads = [8, 4, 1]