diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 24f8704967d46..098d36f162205 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -7,10 +7,9 @@ # to receive an approval from a "code owner" in particular -- any LLVM project # member can approve pull requests. # -# Note that GitHub's concept of "code owner" is independent from LLVM's own -# "code owner" concept, they merely happen to share terminology. See -# https://llvm.org/docs/DeveloperPolicy.html#code-owners, as well as the -# CODE_OWNERS.txt files in the respective subproject directories. +# This is independent of LLVM's own "maintainer" concept. +# See https://llvm.org/docs/DeveloperPolicy.html#maintainers as well as the +# Maintainers.* files in the respective subproject directories. /libcxx/ @llvm/reviewers-libcxx /libcxxabi/ @llvm/reviewers-libcxxabi diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2e4009994a2b0..fe9c70cf1f5a9 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -162,7 +162,7 @@ jobs: cmake -B libc-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_RUNTIMES="libc" -DLLVM_ENABLE_SPHINX=ON ./runtimes TZ=UTC ninja -C libc-build docs-libc-html mkdir built-docs/libc - cp -r libc-build/docs/* built-docs/libc/ + cp -r libc-build/libc/docs/* built-docs/libc/ - name: Build LLD docs if: steps.docs-changed-subprojects.outputs.lld_any_changed == 'true' run: | diff --git a/bolt/lib/Passes/VeneerElimination.cpp b/bolt/lib/Passes/VeneerElimination.cpp index b386b2756a2b8..99d0ffeca8cc2 100644 --- a/bolt/lib/Passes/VeneerElimination.cpp +++ b/bolt/lib/Passes/VeneerElimination.cpp @@ -46,16 +46,17 @@ Error VeneerElimination::runOnFunctions(BinaryContext &BC) { if (BF.isIgnored()) continue; + MCInst &FirstInstruction = *(BF.begin()->begin()); const MCSymbol *VeneerTargetSymbol = 0; uint64_t TargetAddress; - if (BC.MIB->matchAbsLongVeneer(BF, TargetAddress)) { + if (BC.MIB->isTailCall(FirstInstruction)) { + VeneerTargetSymbol = BC.MIB->getTargetSymbol(FirstInstruction); + } else if (BC.MIB->matchAbsLongVeneer(BF, TargetAddress)) { if (BinaryFunction *TargetBF = BC.getBinaryFunctionAtAddress(TargetAddress)) VeneerTargetSymbol = TargetBF->getSymbol(); - } else { - MCInst &FirstInstruction = *(BF.begin()->begin()); - if (BC.MIB->hasAnnotation(FirstInstruction, "AArch64Veneer")) - VeneerTargetSymbol = BC.MIB->getTargetSymbol(FirstInstruction, 1); + } else if (BC.MIB->hasAnnotation(FirstInstruction, "AArch64Veneer")) { + VeneerTargetSymbol = BC.MIB->getTargetSymbol(FirstInstruction, 1); } if (!VeneerTargetSymbol) diff --git a/bolt/test/AArch64/veneer-lld-abs.s b/bolt/test/AArch64/veneer-lld-abs.s index d10ff46e2cb01..7e6fe2d127060 100644 --- a/bolt/test/AArch64/veneer-lld-abs.s +++ b/bolt/test/AArch64/veneer-lld-abs.s @@ -1,5 +1,5 @@ -## Check that llvm-bolt correctly recognizes long absolute thunks generated -## by LLD. +## Check that llvm-bolt correctly recognizes veneers/thunks for absolute code +## generated by LLD. # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o # RUN: %clang %cflags -fno-PIC -no-pie %t.o -o %t.exe -nostdlib \ @@ -12,40 +12,63 @@ .text .balign 4 -.global foo -.type foo, %function -foo: - adrp x1, foo +.global far_function +.type far_function, %function +far_function: ret -.size foo, .-foo +.size far_function, .-far_function + +.global near_function +.type near_function, %function +near_function: + ret +.size near_function, .-near_function + +## Force relocations against .text.
+.reloc 0, R_AARCH64_NONE .section ".mytext", "ax" .balign 4 -.global __AArch64AbsLongThunk_foo -.type __AArch64AbsLongThunk_foo, %function -__AArch64AbsLongThunk_foo: +## This version of a thunk is always generated by LLD for function calls +## spanning more than 256MB. +.global __AArch64AbsLongThunk_far_function +.type __AArch64AbsLongThunk_far_function, %function +__AArch64AbsLongThunk_far_function: ldr x16, .L1 br x16 -# CHECK-INPUT-LABEL: <__AArch64AbsLongThunk_foo>: +# CHECK-INPUT-LABEL: <__AArch64AbsLongThunk_far_function>: # CHECK-INPUT-NEXT: ldr # CHECK-INPUT-NEXT: br .L1: - .quad foo -.size __AArch64AbsLongThunk_foo, .-__AArch64AbsLongThunk_foo + .quad far_function +.size __AArch64AbsLongThunk_far_function, .-__AArch64AbsLongThunk_far_function + +## If a callee is closer than 256MB away, LLD may generate a thunk with a direct +## jump to the callee. Note that the name might still include "AbsLong". +.global __AArch64AbsLongThunk_near_function +.type __AArch64AbsLongThunk_near_function, %function +__AArch64AbsLongThunk_near_function: + b near_function +# CHECK-INPUT-LABEL: <__AArch64AbsLongThunk_near_function>: +# CHECK-INPUT-NEXT: b {{.*}} <near_function> +.size __AArch64AbsLongThunk_near_function, .-__AArch64AbsLongThunk_near_function -## Check that the thunk was removed from .text and _start() calls foo() +## Check that thunks were removed from .text, and _start calls functions ## directly. -# CHECK-OUTPUT-NOT: __AArch64AbsLongThunk_foo +# CHECK-OUTPUT-NOT: __AArch64AbsLongThunk_{{.*}} .global _start .type _start, %function _start: # CHECK-INPUT-LABEL: <_start>: # CHECK-OUTPUT-LABEL: <_start>: - bl __AArch64AbsLongThunk_foo -# CHECK-INPUT-NEXT: bl {{.*}} <__AArch64AbsLongThunk_foo> -# CHECK-OUTPUT-NEXT: bl {{.*}} <foo> + bl __AArch64AbsLongThunk_far_function + bl __AArch64AbsLongThunk_near_function +# CHECK-INPUT-NEXT: bl {{.*}} <__AArch64AbsLongThunk_far_function> +# CHECK-INPUT-NEXT: bl {{.*}} <__AArch64AbsLongThunk_near_function> +# CHECK-OUTPUT-NEXT: bl {{.*}} <far_function> +# CHECK-OUTPUT-NEXT: bl {{.*}} <near_function> ret .size _start, .-_start diff --git a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp index 7da27c0474d51..1bd7abbad66d2 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "ReturnConstRefFromParameterCheck.h" +#include "clang/AST/Attrs.inc" #include "clang/AST/Expr.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" @@ -15,6 +16,14 @@ using namespace clang::ast_matchers; namespace clang::tidy::bugprone { +namespace { + +AST_MATCHER(ParmVarDecl, hasLifetimeBoundAttr) { + return Node.hasAttr<LifetimeBoundAttr>(); +} + +} // namespace + void ReturnConstRefFromParameterCheck::registerMatchers(MatchFinder *Finder) { const auto DRef = ignoringParens( declRefExpr( @@ -22,7 +31,8 @@ void ReturnConstRefFromParameterCheck::registerMatchers(MatchFinder *Finder) { qualType(lValueReferenceType(pointee( qualType(isConstQualified())))) .bind("type"))), - hasDeclContext(functionDecl().bind("owner"))) + hasDeclContext(functionDecl().bind("owner")), + unless(hasLifetimeBoundAttr())) .bind("param"))) .bind("dref")); const auto Func = diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 05dd313d0a0d3..1e981825c7c15 100644 ---
a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -1415,6 +1415,12 @@ void ClangdLSPServer::onInlayHint(const InlayHintsParams &Params, std::move(Reply)); } +void ClangdLSPServer::onCallHierarchyOutgoingCalls( + const CallHierarchyOutgoingCallsParams &Params, + Callback<std::vector<CallHierarchyOutgoingCall>> Reply) { + Server->outgoingCalls(Params.item, std::move(Reply)); +} + void ClangdLSPServer::applyConfiguration( const ConfigurationSettings &Settings) { // Per-file update to the compilation database. @@ -1693,6 +1699,8 @@ void ClangdLSPServer::bindMethods(LSPBinder &Bind, Bind.method("typeHierarchy/subtypes", this, &ClangdLSPServer::onSubTypes); Bind.method("textDocument/prepareCallHierarchy", this, &ClangdLSPServer::onPrepareCallHierarchy); Bind.method("callHierarchy/incomingCalls", this, &ClangdLSPServer::onCallHierarchyIncomingCalls); + if (Opts.EnableOutgoingCalls) + Bind.method("callHierarchy/outgoingCalls", this, &ClangdLSPServer::onCallHierarchyOutgoingCalls); Bind.method("textDocument/selectionRange", this, &ClangdLSPServer::onSelectionRange); Bind.method("textDocument/documentLink", this, &ClangdLSPServer::onDocumentLink); Bind.method("textDocument/semanticTokens/full", this, &ClangdLSPServer::onSemanticTokens); diff --git a/clang-tools-extra/clangd/ClangdLSPServer.h b/clang-tools-extra/clangd/ClangdLSPServer.h index 0b8e4720f5323..597fd9de7ff68 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.h +++ b/clang-tools-extra/clangd/ClangdLSPServer.h @@ -156,6 +156,9 @@ class ClangdLSPServer : private ClangdServer::Callbacks, void onCallHierarchyIncomingCalls( const CallHierarchyIncomingCallsParams &, Callback<std::vector<CallHierarchyIncomingCall>>); + void onCallHierarchyOutgoingCalls( + const CallHierarchyOutgoingCallsParams &, + Callback<std::vector<CallHierarchyOutgoingCall>>); void onClangdInlayHints(const InlayHintsParams &, Callback<llvm::json::Value>); void onInlayHint(const InlayHintsParams &, Callback<std::vector<InlayHint>>); diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp index 9b38be04e7ddd..52be15d3da936 100644 --- a/clang-tools-extra/clangd/ClangdServer.cpp +++ b/clang-tools-extra/clangd/ClangdServer.cpp @@ -215,7 +215,9 @@ ClangdServer::ClangdServer(const GlobalCompilationDatabase &CDB, const ThreadsafeFS &TFS, const Options &Opts, Callbacks *Callbacks) : FeatureModules(Opts.FeatureModules), CDB(CDB), TFS(TFS), - DynamicIdx(Opts.BuildDynamicSymbolIndex ? new FileIndex() : nullptr), + DynamicIdx(Opts.BuildDynamicSymbolIndex + ? new FileIndex(Opts.EnableOutgoingCalls) + : nullptr), ModulesManager(Opts.ModulesManager), ClangTidyProvider(Opts.ClangTidyProvider), UseDirtyHeaders(Opts.UseDirtyHeaders), @@ -256,6 +258,7 @@ ClangdServer::ClangdServer(const GlobalCompilationDatabase &CDB, Callbacks->onBackgroundIndexProgress(S); }; BGOpts.ContextProvider = Opts.ContextProvider; + BGOpts.SupportContainedRefs = Opts.EnableOutgoingCalls; BackgroundIdx = std::make_unique<BackgroundIndex>( TFS, CDB, BackgroundIndexStorage::createDiskBackedStorageFactory( @@ -912,6 +915,15 @@ void ClangdServer::inlayHints(PathRef File, std::optional<Range> RestrictRange, WorkScheduler->runWithAST("InlayHints", File, std::move(Action), Transient); } +void ClangdServer::outgoingCalls( + const CallHierarchyItem &Item, + Callback<std::vector<CallHierarchyOutgoingCall>> CB) { + WorkScheduler->run("Outgoing Calls", "", + [CB = std::move(CB), Item, this]() mutable { + CB(clangd::outgoingCalls(Item, Index)); + }); +} + void ClangdServer::onFileEvent(const DidChangeWatchedFilesParams &Params) { // FIXME: Do nothing for now. This will be used for indexing and potentially // invalidating other caches.
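The new `callHierarchy/outgoingCalls` binding above follows clangd's usual asynchronous shape: the LSP layer forwards the request to `ClangdServer::outgoingCalls`, which queues the index query on the work scheduler and replies through a callback. Below is a minimal standalone sketch of that callback pattern; the types and the `runOnScheduler` helper are simplified stand-ins invented for illustration, not clangd's actual classes.

```cpp
// Standalone sketch (not clangd source) of the callback-based async pattern.
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct CallHierarchyItem { std::string name; std::string data; /* SymbolID */ };
struct CallHierarchyOutgoingCall { CallHierarchyItem to; };

template <typename T> using Callback = std::function<void(T)>;

// Stand-in for WorkScheduler->run(): runs inline here; the real scheduler
// queues the action on a worker thread.
void runOnScheduler(std::function<void()> Action) { Action(); }

// Shape of ClangdServer::outgoingCalls: capture the callback and the item,
// compute on the scheduler, reply through the callback.
void outgoingCalls(const CallHierarchyItem &Item,
                   Callback<std::vector<CallHierarchyOutgoingCall>> CB) {
  runOnScheduler([CB = std::move(CB), Item]() mutable {
    std::vector<CallHierarchyOutgoingCall> Results;
    Results.push_back({{"callee_of_" + Item.name, ""}}); // placeholder result
    CB(std::move(Results));
  });
}

int main() {
  outgoingCalls({"caller3", "ABC123"}, [](auto Calls) {
    for (const auto &C : Calls)
      std::cout << "outgoing call to " << C.to.name << "\n";
  });
}
```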
diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h index a653cdb56b751..e030bf04122d5 100644 --- a/clang-tools-extra/clangd/ClangdServer.h +++ b/clang-tools-extra/clangd/ClangdServer.h @@ -110,6 +110,11 @@ class ClangdServer { /// Cached preambles are potentially large. If false, store them on disk. bool StorePreamblesInMemory = true; + /// Call hierarchy's outgoing calls feature requires additional index + /// serving structures which increase memory usage. If false, these are + /// not created and the feature is not enabled. + bool EnableOutgoingCalls = true; + /// This throttler controls which preambles may be built at a given time. clangd::PreambleThrottler *PreambleThrottler = nullptr; @@ -292,6 +297,10 @@ class ClangdServer { void incomingCalls(const CallHierarchyItem &Item, Callback<std::vector<CallHierarchyIncomingCall>>); + /// Resolve outgoing calls for a given call hierarchy item. + void outgoingCalls(const CallHierarchyItem &Item, + Callback<std::vector<CallHierarchyOutgoingCall>>); + /// Resolve inlay hints for a given document. void inlayHints(PathRef File, std::optional<Range> RestrictRange, Callback<std::vector<InlayHint>>); diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index 61fa66180376c..8295b9d541f7e 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -1702,6 +1702,7 @@ declToHierarchyItem(const NamedDecl &ND, llvm::StringRef TUPath) { HierarchyItem HI; HI.name = printName(Ctx, ND); + // FIXME: Populate HI.detail the way we do in symbolToHierarchyItem? HI.kind = SK; HI.range = Range{sourceLocToPosition(SM, DeclRange->getBegin()), sourceLocToPosition(SM, DeclRange->getEnd())}; @@ -1753,6 +1754,7 @@ static std::optional<HierarchyItem> symbolToHierarchyItem(const Symbol &S, } HierarchyItem HI; HI.name = std::string(S.Name); + HI.detail = (S.Scope + S.Name).str(); HI.kind = indexSymbolKindToSymbolKind(S.SymInfo.Kind); HI.selectionRange = Loc->range; // FIXME: Populate 'range' correctly @@ -2319,6 +2321,65 @@ incomingCalls(const CallHierarchyItem &Item, const SymbolIndex *Index) { return Results; } +std::vector<CallHierarchyOutgoingCall> +outgoingCalls(const CallHierarchyItem &Item, const SymbolIndex *Index) { + std::vector<CallHierarchyOutgoingCall> Results; + if (!Index || Item.data.empty()) + return Results; + auto ID = SymbolID::fromStr(Item.data); + if (!ID) { + elog("outgoingCalls failed to find symbol: {0}", ID.takeError()); + return Results; + } + // In this function, we find outgoing calls based on the index only. + ContainedRefsRequest Request; + Request.ID = *ID; + // Initially store the ranges in a map keyed by SymbolID of the callee. + // This allows us to group different calls to the same function + // into the same CallHierarchyOutgoingCall. + llvm::DenseMap<SymbolID, std::vector<Range>> CallsOut; + // We can populate the ranges based on a refs request only. As we do so, we + // also accumulate the callee IDs into a lookup request. + LookupRequest CallsOutLookup; + Index->containedRefs(Request, [&](const auto &R) { + auto Loc = indexToLSPLocation(R.Location, Item.uri.file()); + if (!Loc) { + elog("outgoingCalls failed to convert location: {0}", Loc.takeError()); + return; + } + auto It = CallsOut.try_emplace(R.Symbol, std::vector<Range>{}).first; + It->second.push_back(Loc->range); + + CallsOutLookup.IDs.insert(R.Symbol); + }); + // Perform the lookup request and combine its results with CallsOut to + // get complete CallHierarchyOutgoingCall objects. + Index->lookup(CallsOutLookup, [&](const Symbol &Callee) { + // The containedRefs request should only return symbols which are + // function-like, i.e.
symbols for which references to them can be "calls". + using SK = index::SymbolKind; + auto Kind = Callee.SymInfo.Kind; + assert(Kind == SK::Function || Kind == SK::InstanceMethod || + Kind == SK::ClassMethod || Kind == SK::StaticMethod || + Kind == SK::Constructor || Kind == SK::Destructor || + Kind == SK::ConversionFunction); + (void)Kind; + (void)SK::Function; + + auto It = CallsOut.find(Callee.ID); + assert(It != CallsOut.end()); + if (auto CHI = symbolToCallHierarchyItem(Callee, Item.uri.file())) + Results.push_back( + CallHierarchyOutgoingCall{std::move(*CHI), std::move(It->second)}); + }); + // Sort results by name of the callee. + llvm::sort(Results, [](const CallHierarchyOutgoingCall &A, + const CallHierarchyOutgoingCall &B) { + return A.to.name < B.to.name; + }); + return Results; +} + llvm::DenseSet<const Decl *> getNonLocalDeclRefs(ParsedAST &AST, const FunctionDecl *FD) { if (!FD->hasBody()) diff --git a/clang-tools-extra/clangd/XRefs.h b/clang-tools-extra/clangd/XRefs.h index df91dd15303c1..247e52314c3f9 100644 --- a/clang-tools-extra/clangd/XRefs.h +++ b/clang-tools-extra/clangd/XRefs.h @@ -150,6 +150,9 @@ prepareCallHierarchy(ParsedAST &AST, Position Pos, PathRef TUPath); std::vector<CallHierarchyIncomingCall> incomingCalls(const CallHierarchyItem &Item, const SymbolIndex *Index); +std::vector<CallHierarchyOutgoingCall> +outgoingCalls(const CallHierarchyItem &Item, const SymbolIndex *Index); + /// Returns all decls that are referenced in the \p FD except local symbols. llvm::DenseSet<const Decl *> getNonLocalDeclRefs(ParsedAST &AST, const FunctionDecl *FD); diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp index 5cde4937fee78..496d1455def4b 100644 --- a/clang-tools-extra/clangd/index/Background.cpp +++ b/clang-tools-extra/clangd/index/Background.cpp @@ -96,7 +96,7 @@ BackgroundIndex::BackgroundIndex( : SwapIndex(std::make_unique<MemIndex>()), TFS(TFS), CDB(CDB), IndexingPriority(Opts.IndexingPriority), ContextProvider(std::move(Opts.ContextProvider)), - IndexedSymbols(IndexContents::All), + IndexedSymbols(IndexContents::All, Opts.SupportContainedRefs), Rebuilder(this, &IndexedSymbols, Opts.ThreadPoolSize), IndexStorageFactory(std::move(IndexStorageFactory)), Queue(std::move(Opts.OnProgress)), diff --git a/clang-tools-extra/clangd/index/Background.h b/clang-tools-extra/clangd/index/Background.h index 0d719ffdb957e..448e911201575 100644 --- a/clang-tools-extra/clangd/index/Background.h +++ b/clang-tools-extra/clangd/index/Background.h @@ -145,6 +145,9 @@ class BackgroundIndex : public SwapIndex { // file. Called with the empty string for other tasks. // (When called, the context from BackgroundIndex construction is active). std::function<Context(PathRef)> ContextProvider = nullptr; + // Whether the index needs to support the containedRefs() operation. + // May use extra memory. + bool SupportContainedRefs = true; }; /// Creates a new background index and starts its threads. diff --git a/clang-tools-extra/clangd/index/BackgroundRebuild.cpp b/clang-tools-extra/clangd/index/BackgroundRebuild.cpp index 79383be012f83..4dc2d3b1d059b 100644 --- a/clang-tools-extra/clangd/index/BackgroundRebuild.cpp +++ b/clang-tools-extra/clangd/index/BackgroundRebuild.cpp @@ -1,4 +1,4 @@ -//===-- BackgroundRebuild.cpp - when to rebuild thei background index -----===// +//===-- BackgroundRebuild.cpp - when to rebuild the background index ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information.
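For orientation, here is a compact standalone model of the two-phase algorithm `outgoingCalls` implements above: phase one streams `containedRefs` results and groups call-site ranges by callee SymbolID, phase two resolves all unique callee IDs in one batched lookup and sorts the results by name. All types below are simplified stand-ins invented for illustration, not the clangd ones.

```cpp
// Standalone sketch (not clangd source) of grouping refs by callee, then
// materializing one result per callee via a batched lookup.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

using SymbolID = std::uint64_t;
struct Range { int Line; };
struct Symbol { SymbolID ID; std::string Name; };

// Toy "index": (callee ID, call site) pairs contained in one function.
std::vector<std::pair<SymbolID, Range>> containedRefs() {
  return {{2, {10}}, {2, {12}}, {3, {11}}}; // callee 2 is called twice
}
Symbol lookup(SymbolID ID) { return {ID, "func" + std::to_string(ID)}; }

int main() {
  // Phase 1: group ranges by callee, collecting IDs for the batch lookup.
  std::map<SymbolID, std::vector<Range>> CallsOut;
  std::set<SymbolID> CallsOutLookup;
  for (const auto &[Callee, R] : containedRefs()) {
    CallsOut[Callee].push_back(R);
    CallsOutLookup.insert(Callee);
  }
  // Phase 2: one lookup per unique callee, then sort results by name.
  std::vector<std::pair<std::string, std::vector<Range>>> Results;
  for (SymbolID ID : CallsOutLookup)
    Results.emplace_back(lookup(ID).Name, std::move(CallsOut[ID]));
  std::sort(Results.begin(), Results.end(),
            [](const auto &A, const auto &B) { return A.first < B.first; });
  for (const auto &[Name, Ranges] : Results)
    std::cout << Name << ": " << Ranges.size() << " call site(s)\n";
}
```

Grouping before the lookup is the key design choice: repeated calls to the same function collapse into a single CallHierarchyOutgoingCall with several ranges, and the index is queried once per unique callee rather than once per call site.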
diff --git a/clang-tools-extra/clangd/index/FileIndex.cpp b/clang-tools-extra/clangd/index/FileIndex.cpp index eb9562d2b6bf8..aa573e312a756 100644 --- a/clang-tools-extra/clangd/index/FileIndex.cpp +++ b/clang-tools-extra/clangd/index/FileIndex.cpp @@ -239,8 +239,8 @@ SlabTuple indexHeaderSymbols(llvm::StringRef Version, ASTContext &AST, /*CollectMainFileRefs=*/false); } -FileSymbols::FileSymbols(IndexContents IdxContents) - : IdxContents(IdxContents) {} +FileSymbols::FileSymbols(IndexContents IdxContents, bool SupportContainedRefs) + : IdxContents(IdxContents), SupportContainedRefs(SupportContainedRefs) {} void FileSymbols::update(llvm::StringRef Key, std::unique_ptr<SymbolSlab> Symbols, @@ -395,7 +395,7 @@ FileSymbols::buildIndex(IndexType Type, DuplicateHandling DuplicateHandle, std::move(AllRelations), std::move(Files), IdxContents, std::make_tuple(std::move(SymbolSlabs), std::move(RefSlabs), std::move(RefsStorage), std::move(SymsStorage)), - StorageSize); + StorageSize, SupportContainedRefs); } llvm_unreachable("Unknown clangd::IndexType"); } @@ -419,11 +419,12 @@ void FileSymbols::profile(MemoryTree &MT) const { } } -FileIndex::FileIndex() +FileIndex::FileIndex(bool SupportContainedRefs) : MergedIndex(&MainFileIndex, &PreambleIndex), - PreambleSymbols(IndexContents::Symbols | IndexContents::Relations), + PreambleSymbols(IndexContents::Symbols | IndexContents::Relations, + SupportContainedRefs), PreambleIndex(std::make_unique<MemIndex>()), - MainFileSymbols(IndexContents::All), + MainFileSymbols(IndexContents::All, SupportContainedRefs), MainFileIndex(std::make_unique<MemIndex>()) {} void FileIndex::updatePreamble(IndexFileIn IF) { diff --git a/clang-tools-extra/clangd/index/FileIndex.h b/clang-tools-extra/clangd/index/FileIndex.h index 44f33e8fbcd51..8e88dc9712996 100644 --- a/clang-tools-extra/clangd/index/FileIndex.h +++ b/clang-tools-extra/clangd/index/FileIndex.h @@ -69,7 +69,7 @@ enum class DuplicateHandling { /// locking when we swap or obtain references to snapshots. class FileSymbols { public: - FileSymbols(IndexContents IdxContents); + FileSymbols(IndexContents IdxContents, bool SupportContainedRefs); /// Updates all slabs associated with the \p Key. /// If either is nullptr, corresponding data for \p Key will be removed. /// If CountReferences is true, \p Refs will be used for counting references @@ -91,6 +91,7 @@ class FileSymbols { private: IndexContents IdxContents; + bool SupportContainedRefs; struct RefSlabAndCountReferences { std::shared_ptr<RefSlab> Slab; @@ -108,7 +109,7 @@ /// FIXME: Expose an interface to remove files that are closed. class FileIndex : public MergedIndex { public: - FileIndex(); + FileIndex(bool SupportContainedRefs); /// Update preamble symbols of file \p Path with all declarations in \p AST /// and macros in \p PP.
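Every implementation of the new `containedRefs()` operation introduced below (SwapIndex, MemIndex, Dex, MergedIndex, the remote client) follows the same streaming contract as `refs()`: results are delivered through a callback, `Req.Limit` caps how many are reported, and the boolean return value signals truncation. A minimal sketch of that contract, with toy types invented in place of the clangd ones:

```cpp
// Standalone sketch (not clangd source) of the Limit/"has more" streaming
// contract used by SymbolIndex::containedRefs implementations.
#include <cstdint>
#include <functional>
#include <iostream>
#include <limits>
#include <optional>
#include <vector>

struct Result { int Value; };

bool streamResults(const std::vector<Result> &All,
                   std::optional<uint32_t> Limit,
                   std::function<void(const Result &)> Callback) {
  // No limit means "report everything": treat it as the max representable.
  uint32_t Remaining = Limit.value_or(std::numeric_limits<uint32_t>::max());
  for (const Result &R : All) {
    if (Remaining == 0)
      return true; // More results were available than the limit allowed.
    --Remaining;
    Callback(R);
  }
  return false; // All results reported.
}

int main() {
  std::vector<Result> All{{1}, {2}, {3}};
  bool HasMore = streamResults(
      All, /*Limit=*/2, [](const Result &R) { std::cout << R.Value << "\n"; });
  std::cout << "truncated: " << (HasMore ? "yes" : "no") << "\n";
}
```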
diff --git a/clang-tools-extra/clangd/index/Index.cpp b/clang-tools-extra/clangd/index/Index.cpp index 7a0c23287db22..86dc6ed763344 100644 --- a/clang-tools-extra/clangd/index/Index.cpp +++ b/clang-tools-extra/clangd/index/Index.cpp @@ -66,6 +66,11 @@ bool SwapIndex::refs(const RefsRequest &R, llvm::function_ref<void(const Ref &)> CB) const { return snapshot()->refs(R, CB); } +bool SwapIndex::containedRefs( + const ContainedRefsRequest &R, + llvm::function_ref<void(const ContainedRefsResult &)> CB) const { + return snapshot()->containedRefs(R, CB); +} void SwapIndex::relations( const RelationsRequest &R, llvm::function_ref<void(const SymbolID &, const Symbol &)> CB) const { diff --git a/clang-tools-extra/clangd/index/Index.h b/clang-tools-extra/clangd/index/Index.h index 047ce08e93e3a..a193b1a191216 100644 --- a/clang-tools-extra/clangd/index/Index.h +++ b/clang-tools-extra/clangd/index/Index.h @@ -77,6 +77,19 @@ struct RefsRequest { bool WantContainer = false; }; +struct ContainedRefsRequest { + /// Note that RefKind::Call just restricts the matched SymbolKind to + /// functions, not the form of the reference (e.g. address-of-function, + /// which can indicate an indirect call, should still be caught). + static const RefKind SupportedRefKinds = RefKind::Call; + + SymbolID ID; + /// If set, limit the number of refs returned from the index. The index may + /// choose to return less than this, e.g. it tries to avoid returning stale + /// results. + std::optional<uint32_t> Limit; +}; + struct RelationsRequest { llvm::DenseSet<SymbolID> Subjects; RelationKind Predicate; @@ -84,6 +97,14 @@ std::optional<uint32_t> Limit; }; +struct ContainedRefsResult { + /// The source location where the symbol is named. + SymbolLocation Location; + RefKind Kind = RefKind::Unknown; + /// The ID of the symbol which is referred to. + SymbolID Symbol; +}; + /// Describes what data is covered by an index. /// /// Indexes may contain symbols but not references from a file, etc. @@ -141,6 +162,17 @@ class SymbolIndex { virtual bool refs(const RefsRequest &Req, llvm::function_ref<void(const Ref &)> Callback) const = 0; + /// Find all symbols that are referenced by a symbol and apply + /// \p Callback on each result. + /// + /// Results should be returned in arbitrary order. + /// The returned result must be deep-copied if it's used outside Callback. + /// + /// Returns true if there will be more results (limited by Req.Limit). + virtual bool containedRefs( + const ContainedRefsRequest &Req, + llvm::function_ref<void(const ContainedRefsResult &)> Callback) const = 0; + /// Finds all relations (S, P, O) stored in the index such that S is among /// Req.Subjects and P is Req.Predicate, and invokes \p Callback for (S, O) in /// each. @@ -175,6 +207,9 @@ class SwapIndex : public SymbolIndex { llvm::function_ref<void(const Symbol &)>) const override; bool refs(const RefsRequest &, llvm::function_ref<void(const Ref &)>) const override; + bool containedRefs( + const ContainedRefsRequest &, + llvm::function_ref<void(const ContainedRefsResult &)>) const override; void relations(const RelationsRequest &, llvm::function_ref<void(const SymbolID &, const Symbol &)>) const override; diff --git a/clang-tools-extra/clangd/index/MemIndex.cpp b/clang-tools-extra/clangd/index/MemIndex.cpp index 2665d46b97d83..9c9d3942bdee6 100644 --- a/clang-tools-extra/clangd/index/MemIndex.cpp +++ b/clang-tools-extra/clangd/index/MemIndex.cpp @@ -9,6 +9,7 @@ #include "MemIndex.h" #include "FuzzyMatch.h" #include "Quality.h" +#include "index/Index.h" #include "support/Trace.h" namespace clang { namespace clangd { @@ -85,6 +86,25 @@ bool MemIndex::refs(const RefsRequest &Req, return false; // We reported all refs.
} +bool MemIndex::containedRefs( + const ContainedRefsRequest &Req, + llvm::function_ref<void(const ContainedRefsResult &)> Callback) const { + trace::Span Tracer("MemIndex refersTo"); + uint32_t Remaining = Req.Limit.value_or(std::numeric_limits<uint32_t>::max()); + for (const auto &Pair : Refs) { + for (const auto &R : Pair.second) { + if (!static_cast<bool>(ContainedRefsRequest::SupportedRefKinds & R.Kind) || + Req.ID != R.Container) + continue; + if (Remaining == 0) + return true; // More refs were available. + --Remaining; + Callback({R.Location, R.Kind, Pair.first}); + } + } + return false; // We reported all refs. +} + void MemIndex::relations( const RelationsRequest &Req, llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const { diff --git a/clang-tools-extra/clangd/index/MemIndex.h b/clang-tools-extra/clangd/index/MemIndex.h index fba2c1a7120a2..8f390c5028dc4 100644 --- a/clang-tools-extra/clangd/index/MemIndex.h +++ b/clang-tools-extra/clangd/index/MemIndex.h @@ -72,6 +72,10 @@ class MemIndex : public SymbolIndex { bool refs(const RefsRequest &Req, llvm::function_ref<void(const Ref &)> Callback) const override; + bool containedRefs(const ContainedRefsRequest &Req, + llvm::function_ref<void(const ContainedRefsResult &)> + Callback) const override; + void relations(const RelationsRequest &Req, llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const override; diff --git a/clang-tools-extra/clangd/index/Merge.cpp b/clang-tools-extra/clangd/index/Merge.cpp index 8221d4b1f4440..aecca38a885b6 100644 --- a/clang-tools-extra/clangd/index/Merge.cpp +++ b/clang-tools-extra/clangd/index/Merge.cpp @@ -155,6 +155,40 @@ bool MergedIndex::refs(const RefsRequest &Req, return More || StaticHadMore; } +bool MergedIndex::containedRefs( + const ContainedRefsRequest &Req, + llvm::function_ref<void(const ContainedRefsResult &)> Callback) const { + trace::Span Tracer("MergedIndex refersTo"); + bool More = false; + uint32_t Remaining = Req.Limit.value_or(std::numeric_limits<uint32_t>::max()); + // We don't want duplicated refs from the static/dynamic indexes, + // and we can't reliably deduplicate them because offsets may differ slightly. + // We consider the dynamic index authoritative and report all its refs, + // and only report static index refs from other files. + More |= Dynamic->containedRefs(Req, [&](const auto &O) { + Callback(O); + assert(Remaining != 0); + --Remaining; + }); + if (Remaining == 0 && More) + return More; + auto DynamicContainsFile = Dynamic->indexedFiles(); + // We return less than Req.Limit if static index returns more refs for dirty + // files. + bool StaticHadMore = Static->containedRefs(Req, [&](const auto &O) { + if ((DynamicContainsFile(O.Location.FileURI) & IndexContents::References) != + IndexContents::None) + return; // ignore refs that have been seen from dynamic index.
+ if (Remaining == 0) { + More = true; + return; + } + --Remaining; + Callback(O); + }); + return More || StaticHadMore; +} + llvm::unique_function MergedIndex::indexedFiles() const { return [DynamicContainsFile{Dynamic->indexedFiles()}, diff --git a/clang-tools-extra/clangd/index/Merge.h b/clang-tools-extra/clangd/index/Merge.h index b8a562b0df5d9..7441be6e57e85 100644 --- a/clang-tools-extra/clangd/index/Merge.h +++ b/clang-tools-extra/clangd/index/Merge.h @@ -38,6 +38,9 @@ class MergedIndex : public SymbolIndex { llvm::function_ref) const override; bool refs(const RefsRequest &, llvm::function_ref) const override; + bool containedRefs( + const ContainedRefsRequest &, + llvm::function_ref) const override; void relations(const RelationsRequest &, llvm::function_ref) const override; diff --git a/clang-tools-extra/clangd/index/ProjectAware.cpp b/clang-tools-extra/clangd/index/ProjectAware.cpp index 2c6f8273b35d0..9836f0130362a 100644 --- a/clang-tools-extra/clangd/index/ProjectAware.cpp +++ b/clang-tools-extra/clangd/index/ProjectAware.cpp @@ -35,6 +35,10 @@ class ProjectAwareIndex : public SymbolIndex { /// Query all indexes while prioritizing the associated one (if any). bool refs(const RefsRequest &Req, llvm::function_ref Callback) const override; + /// Query all indexes while prioritizing the associated one (if any). + bool containedRefs(const ContainedRefsRequest &Req, + llvm::function_ref + Callback) const override; /// Queries only the associates index when Req.RestrictForCodeCompletion is /// set, otherwise queries all. @@ -94,6 +98,15 @@ bool ProjectAwareIndex::refs( return false; } +bool ProjectAwareIndex::containedRefs( + const ContainedRefsRequest &Req, + llvm::function_ref Callback) const { + trace::Span Tracer("ProjectAwareIndex::refersTo"); + if (auto *Idx = getIndex()) + return Idx->containedRefs(Req, Callback); + return false; +} + bool ProjectAwareIndex::fuzzyFind( const FuzzyFindRequest &Req, llvm::function_ref Callback) const { diff --git a/clang-tools-extra/clangd/index/Ref.h b/clang-tools-extra/clangd/index/Ref.h index 6e383e2ade3d2..870f77f56e6cb 100644 --- a/clang-tools-extra/clangd/index/Ref.h +++ b/clang-tools-extra/clangd/index/Ref.h @@ -63,6 +63,9 @@ enum class RefKind : uint8_t { // ^ this references Foo, but does not explicitly spell out its name // }; Spelled = 1 << 3, + // A reference which is a call. Used as a filter for which references + // to store in data structures used for computing outgoing calls. + Call = 1 << 4, All = Declaration | Definition | Reference | Spelled, }; diff --git a/clang-tools-extra/clangd/index/Serialization.cpp b/clang-tools-extra/clangd/index/Serialization.cpp index 72a4e8b007668..f03839599612c 100644 --- a/clang-tools-extra/clangd/index/Serialization.cpp +++ b/clang-tools-extra/clangd/index/Serialization.cpp @@ -457,7 +457,7 @@ readCompileCommand(Reader CmdReader, llvm::ArrayRef Strings) { // The current versioning scheme is simple - non-current versions are rejected. // If you make a breaking change, bump this version number to invalidate stored // data. Later we may want to support some backward compatibility. 
-constexpr static uint32_t Version = 19; +constexpr static uint32_t Version = 20; llvm::Expected readRIFF(llvm::StringRef Data, SymbolOrigin Origin) { @@ -704,7 +704,8 @@ llvm::Expected readIndexFile(llvm::StringRef Data, } std::unique_ptr loadIndex(llvm::StringRef SymbolFilename, - SymbolOrigin Origin, bool UseDex) { + SymbolOrigin Origin, bool UseDex, + bool SupportContainedRefs) { trace::Span OverallTracer("LoadIndex"); auto Buffer = llvm::MemoryBuffer::getFile(SymbolFilename); if (!Buffer) { @@ -735,10 +736,11 @@ std::unique_ptr loadIndex(llvm::StringRef SymbolFilename, size_t NumRelations = Relations.size(); trace::Span Tracer("BuildIndex"); - auto Index = UseDex ? dex::Dex::build(std::move(Symbols), std::move(Refs), - std::move(Relations)) - : MemIndex::build(std::move(Symbols), std::move(Refs), - std::move(Relations)); + auto Index = UseDex + ? dex::Dex::build(std::move(Symbols), std::move(Refs), + std::move(Relations), SupportContainedRefs) + : MemIndex::build(std::move(Symbols), std::move(Refs), + std::move(Relations)); vlog("Loaded {0} from {1} with estimated memory usage {2} bytes\n" " - number of symbols: {3}\n" " - number of refs: {4}\n" diff --git a/clang-tools-extra/clangd/index/Serialization.h b/clang-tools-extra/clangd/index/Serialization.h index b6890d63d2c38..bf8e036afcb6c 100644 --- a/clang-tools-extra/clangd/index/Serialization.h +++ b/clang-tools-extra/clangd/index/Serialization.h @@ -83,7 +83,8 @@ std::string toYAML(const Ref &); // Build an in-memory static index from an index file. // The size should be relatively small, so data can be managed in memory. std::unique_ptr loadIndex(llvm::StringRef Filename, - SymbolOrigin Origin, bool UseDex = true); + SymbolOrigin Origin, bool UseDex, + bool SupportContainedRefs); } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp index 91ae9d3003a97..81125dbb1aeaf 100644 --- a/clang-tools-extra/clangd/index/SymbolCollector.cpp +++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp @@ -18,6 +18,7 @@ #include "clang-include-cleaner/Record.h" #include "clang-include-cleaner/Types.h" #include "index/CanonicalIncludes.h" +#include "index/Ref.h" #include "index/Relation.h" #include "index/Symbol.h" #include "index/SymbolID.h" @@ -660,7 +661,7 @@ bool SymbolCollector::handleDeclOccurrence( auto FileLoc = SM.getFileLoc(Loc); auto FID = SM.getFileID(FileLoc); if (Opts.RefsInHeaders || FID == SM.getMainFileID()) { - addRef(ID, SymbolRef{FileLoc, FID, Roles, + addRef(ID, SymbolRef{FileLoc, FID, Roles, index::getSymbolInfo(ND).Kind, getRefContainer(ASTNode.Parent, Opts), isSpelled(FileLoc, *ND)}); } @@ -774,8 +775,10 @@ bool SymbolCollector::handleMacroOccurrence(const IdentifierInfo *Name, // FIXME: Populate container information for macro references. // FIXME: All MacroRefs are marked as Spelled now, but this should be // checked. - addRef(ID, SymbolRef{Loc, SM.getFileID(Loc), Roles, /*Container=*/nullptr, - /*Spelled=*/true}); + addRef(ID, + SymbolRef{Loc, SM.getFileID(Loc), Roles, index::SymbolKind::Macro, + /*Container=*/nullptr, + /*Spelled=*/true}); } // Collect symbols. 
@@ -1166,6 +1169,14 @@ bool SymbolCollector::shouldIndexFile(FileID FID) { return I.first->second; } +static bool refIsCall(index::SymbolKind Kind) { + using SK = index::SymbolKind; + return Kind == SK::Function || Kind == SK::InstanceMethod || + Kind == SK::ClassMethod || Kind == SK::StaticMethod || + Kind == SK::Constructor || Kind == SK::Destructor || + Kind == SK::ConversionFunction; +} + void SymbolCollector::addRef(SymbolID ID, const SymbolRef &SR) { const auto &SM = ASTCtx->getSourceManager(); // FIXME: use the result to filter out references. @@ -1177,6 +1188,9 @@ void SymbolCollector::addRef(SymbolID ID, const SymbolRef &SR) { R.Location.End = Range.second; R.Location.FileURI = HeaderFileURIs->toURI(*FE).c_str(); R.Kind = toRefKind(SR.Roles, SR.Spelled); + if (refIsCall(SR.Kind)) { + R.Kind |= RefKind::Call; + } R.Container = getSymbolIDCached(SR.Container); Refs.insert(ID, R); } diff --git a/clang-tools-extra/clangd/index/SymbolCollector.h b/clang-tools-extra/clangd/index/SymbolCollector.h index 6ff7a0145ff87..e9eb27fd0f664 100644 --- a/clang-tools-extra/clangd/index/SymbolCollector.h +++ b/clang-tools-extra/clangd/index/SymbolCollector.h @@ -209,6 +209,7 @@ class SymbolCollector : public index::IndexDataConsumer { SourceLocation Loc; FileID FID; index::SymbolRoleSet Roles; + index::SymbolKind Kind; const Decl *Container; bool Spelled; }; diff --git a/clang-tools-extra/clangd/index/dex/Dex.cpp b/clang-tools-extra/clangd/index/dex/Dex.cpp index b7d3063e19b49..5643ba0c5e4ce 100644 --- a/clang-tools-extra/clangd/index/dex/Dex.cpp +++ b/clang-tools-extra/clangd/index/dex/Dex.cpp @@ -33,13 +33,14 @@ namespace clangd { namespace dex { std::unique_ptr Dex::build(SymbolSlab Symbols, RefSlab Refs, - RelationSlab Rels) { + RelationSlab Rels, + bool SupportContainedRefs) { auto Size = Symbols.bytes() + Refs.bytes(); // There is no need to include "Rels" in Data because the relations are self- // contained, without references into a backing store. auto Data = std::make_pair(std::move(Symbols), std::move(Refs)); return std::make_unique(Data.first, Data.second, Rels, std::move(Data), - Size); + Size, SupportContainedRefs); } namespace { @@ -120,7 +121,7 @@ class IndexBuilder { } // namespace -void Dex::buildIndex() { +void Dex::buildIndex(bool SupportContainedRefs) { this->Corpus = dex::Corpus(Symbols.size()); std::vector> ScoredSymbols(Symbols.size()); @@ -147,6 +148,20 @@ void Dex::buildIndex() { for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) Builder.add(*Symbols[SymbolRank], SymbolRank); InvertedIndex = std::move(Builder).build(); + + // If the containedRefs() operation is supported, build the RevRefs + // data structure used to implement it. + if (!SupportContainedRefs) + return; + for (const auto &[ID, RefList] : Refs) + for (const auto &R : RefList) + if ((R.Kind & ContainedRefsRequest::SupportedRefKinds) != + RefKind::Unknown) + RevRefs.emplace_back(R, ID); + // Sort by container ID so we can use binary search for lookup. + llvm::sort(RevRefs, [](const RevRef &A, const RevRef &B) { + return A.ref().Container < B.ref().Container; + }); } std::unique_ptr Dex::iterator(const Token &Tok) const { @@ -314,6 +329,36 @@ bool Dex::refs(const RefsRequest &Req, return false; // We reported all refs. } +llvm::iterator_range::const_iterator> +Dex::lookupRevRefs(const SymbolID &Container) const { + // equal_range() requires an element of the same type as the elements of the + // range, so construct a dummy RevRef with the container of interest. 
+ Ref QueryRef; + QueryRef.Container = Container; + RevRef Query(QueryRef, SymbolID{}); + + auto ItPair = std::equal_range(RevRefs.cbegin(), RevRefs.cend(), Query, + [](const RevRef &A, const RevRef &B) { + return A.ref().Container < B.ref().Container; + }); + return {ItPair.first, ItPair.second}; +} + +bool Dex::containedRefs( + const ContainedRefsRequest &Req, + llvm::function_ref Callback) const { + trace::Span Tracer("Dex reversed refs"); + uint32_t Remaining = Req.Limit.value_or(std::numeric_limits::max()); + for (const auto &Rev : lookupRevRefs(Req.ID)) { + // RevRefs are already filtered to ContainedRefsRequest::SupportedRefKinds + if (Remaining == 0) + return true; // More refs were available. + --Remaining; + Callback(Rev.containedRefsResult()); + } + return false; // We reported all refs. +} + void Dex::relations( const RelationsRequest &Req, llvm::function_ref Callback) const { @@ -350,6 +395,7 @@ size_t Dex::estimateMemoryUsage() const { for (const auto &TokenToPostingList : InvertedIndex) Bytes += TokenToPostingList.second.bytes(); Bytes += Refs.getMemorySize(); + Bytes += RevRefs.size() * sizeof(RevRef); Bytes += Relations.getMemorySize(); return Bytes + BackingDataSize; } diff --git a/clang-tools-extra/clangd/index/dex/Dex.h b/clang-tools-extra/clangd/index/dex/Dex.h index 69e161d51135b..20c0503d19b97 100644 --- a/clang-tools-extra/clangd/index/dex/Dex.h +++ b/clang-tools-extra/clangd/index/dex/Dex.h @@ -36,7 +36,8 @@ class Dex : public SymbolIndex { public: // All data must outlive this index. template - Dex(SymbolRange &&Symbols, RefsRange &&Refs, RelationsRange &&Relations) + Dex(SymbolRange &&Symbols, RefsRange &&Refs, RelationsRange &&Relations, + bool SupportContainedRefs) : Corpus(0) { for (auto &&Sym : Symbols) this->Symbols.push_back(&Sym); @@ -46,15 +47,15 @@ class Dex : public SymbolIndex { this->Relations[std::make_pair(Rel.Subject, static_cast(Rel.Predicate))] .push_back(Rel.Object); - buildIndex(); + buildIndex(SupportContainedRefs); } // Symbols and Refs are owned by BackingData, Index takes ownership. template Dex(SymbolRange &&Symbols, RefsRange &&Refs, RelationsRange &&Relations, - Payload &&BackingData, size_t BackingDataSize) + Payload &&BackingData, size_t BackingDataSize, bool SupportContainedRefs) : Dex(std::forward(Symbols), std::forward(Refs), - std::forward(Relations)) { + std::forward(Relations), SupportContainedRefs) { KeepAlive = std::shared_ptr( std::make_shared(std::move(BackingData)), nullptr); this->BackingDataSize = BackingDataSize; @@ -64,16 +65,18 @@ class Dex : public SymbolIndex { typename FileRange, typename Payload> Dex(SymbolRange &&Symbols, RefsRange &&Refs, RelationsRange &&Relations, FileRange &&Files, IndexContents IdxContents, Payload &&BackingData, - size_t BackingDataSize) + size_t BackingDataSize, bool SupportContainedRefs) : Dex(std::forward(Symbols), std::forward(Refs), std::forward(Relations), - std::forward(BackingData), BackingDataSize) { + std::forward(BackingData), BackingDataSize, + SupportContainedRefs) { this->Files = std::forward(Files); this->IdxContents = IdxContents; } /// Builds an index from slabs. The index takes ownership of the slab. 
- static std::unique_ptr build(SymbolSlab, RefSlab, RelationSlab); + static std::unique_ptr build(SymbolSlab, RefSlab, RelationSlab, + bool SupportContainedRefs); bool fuzzyFind(const FuzzyFindRequest &Req, @@ -85,6 +88,10 @@ class Dex : public SymbolIndex { bool refs(const RefsRequest &Req, llvm::function_ref Callback) const override; + bool containedRefs(const ContainedRefsRequest &Req, + llvm::function_ref + Callback) const override; + void relations(const RelationsRequest &Req, llvm::function_ref Callback) const override; @@ -95,7 +102,22 @@ class Dex : public SymbolIndex { size_t estimateMemoryUsage() const override; private: - void buildIndex(); + class RevRef { + const Ref *Reference; + SymbolID Target; + + public: + RevRef(const Ref &Reference, SymbolID Target) + : Reference(&Reference), Target(Target) {} + const Ref &ref() const { return *Reference; } + ContainedRefsResult containedRefsResult() const { + return {ref().Location, ref().Kind, Target}; + } + }; + + void buildIndex(bool EnableOutgoingCalls); + llvm::iterator_range::const_iterator> + lookupRevRefs(const SymbolID &Container) const; std::unique_ptr iterator(const Token &Tok) const; std::unique_ptr createFileProximityIterator(llvm::ArrayRef ProximityPaths) const; @@ -116,6 +138,7 @@ class Dex : public SymbolIndex { llvm::DenseMap InvertedIndex; dex::Corpus Corpus; llvm::DenseMap> Refs; + std::vector RevRefs; // sorted by container ID static_assert(sizeof(RelationKind) == sizeof(uint8_t), "RelationKind should be of same size as a uint8_t"); llvm::DenseMap, std::vector> Relations; diff --git a/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp b/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp index cea59ae409914..f185808ae1544 100644 --- a/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp +++ b/clang-tools-extra/clangd/index/dex/dexp/Dexp.cpp @@ -375,7 +375,8 @@ std::unique_ptr openIndex(llvm::StringRef Index) { return Index.starts_with("remote:") ? 
remote::getClient(Index.drop_front(strlen("remote:")), ProjectRoot) - : loadIndex(Index, SymbolOrigin::Static, /*UseDex=*/true); + : loadIndex(Index, SymbolOrigin::Static, /*UseDex=*/true, + /*SupportContainedRefs=*/true); } bool runCommand(std::string Request, const SymbolIndex &Index) { diff --git a/clang-tools-extra/clangd/index/remote/Client.cpp b/clang-tools-extra/clangd/index/remote/Client.cpp index 391da3916259c..79b827126b4ef 100644 --- a/clang-tools-extra/clangd/index/remote/Client.cpp +++ b/clang-tools-extra/clangd/index/remote/Client.cpp @@ -146,6 +146,13 @@ class IndexClient : public clangd::SymbolIndex { return streamRPC(Request, &remote::v1::SymbolIndex::Stub::Refs, Callback); } + bool containedRefs(const clangd::ContainedRefsRequest &Request, + llvm::function_ref + Callback) const override { + return streamRPC(Request, &remote::v1::SymbolIndex::Stub::ContainedRefs, + Callback); + } + void relations(const clangd::RelationsRequest &Request, llvm::function_ref diff --git a/clang-tools-extra/clangd/index/remote/Index.proto b/clang-tools-extra/clangd/index/remote/Index.proto index 3072299d8f345..689ef9d44ee40 100644 --- a/clang-tools-extra/clangd/index/remote/Index.proto +++ b/clang-tools-extra/clangd/index/remote/Index.proto @@ -131,3 +131,21 @@ message Relation { optional string subject_id = 1; optional Symbol object = 2; } + +message ContainedRefsRequest { + required string id = 1; + optional uint32 limit = 2; +} + +message ContainedRefsReply { + oneof kind { + ContainedRef stream_result = 1; + FinalResult final_result = 2; + } +} + +message ContainedRef { + required SymbolLocation location = 1; + required uint32 kind = 2; + required string symbol = 3; +} diff --git a/clang-tools-extra/clangd/index/remote/Service.proto b/clang-tools-extra/clangd/index/remote/Service.proto index 7c7efa530200d..43023321cb9e1 100644 --- a/clang-tools-extra/clangd/index/remote/Service.proto +++ b/clang-tools-extra/clangd/index/remote/Service.proto @@ -21,5 +21,7 @@ service SymbolIndex { rpc Refs(RefsRequest) returns (stream RefsReply) {} + rpc ContainedRefs(ContainedRefsRequest) returns (stream ContainedRefsReply) {} + rpc Relations(RelationsRequest) returns (stream RelationsReply) {} } diff --git a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp index 7e31ada18a657..a80d12347d48d 100644 --- a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp +++ b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.cpp @@ -126,6 +126,18 @@ Marshaller::fromProtobuf(const RefsRequest *Message) { return Req; } +llvm::Expected +Marshaller::fromProtobuf(const ContainedRefsRequest *Message) { + clangd::ContainedRefsRequest Req; + auto ID = SymbolID::fromStr(Message->id()); + if (!ID) + return ID.takeError(); + Req.ID = *ID; + if (Message->has_limit()) + Req.Limit = Message->limit(); + return Req; +} + llvm::Expected Marshaller::fromProtobuf(const RelationsRequest *Message) { clangd::RelationsRequest Req; @@ -192,6 +204,21 @@ llvm::Expected Marshaller::fromProtobuf(const Ref &Message) { return Result; } +llvm::Expected +Marshaller::fromProtobuf(const ContainedRef &Message) { + clangd::ContainedRefsResult Result; + auto Location = fromProtobuf(Message.location()); + if (!Location) + return Location.takeError(); + Result.Location = *Location; + Result.Kind = static_cast(Message.kind()); + auto Symbol = SymbolID::fromStr(Message.symbol()); + if (!Symbol) + return Symbol.takeError(); + Result.Symbol = *Symbol; + 
return Result; +} + llvm::Expected> Marshaller::fromProtobuf(const Relation &Message) { auto SubjectID = SymbolID::fromStr(Message.subject_id()); @@ -244,6 +271,15 @@ RefsRequest Marshaller::toProtobuf(const clangd::RefsRequest &From) { return RPCRequest; } +ContainedRefsRequest +Marshaller::toProtobuf(const clangd::ContainedRefsRequest &From) { + ContainedRefsRequest RPCRequest; + RPCRequest.set_id(From.ID.str()); + if (From.Limit) + RPCRequest.set_limit(*From.Limit); + return RPCRequest; +} + RelationsRequest Marshaller::toProtobuf(const clangd::RelationsRequest &From) { RelationsRequest RPCRequest; for (const auto &ID : From.Subjects) @@ -299,6 +335,18 @@ llvm::Expected Marshaller::toProtobuf(const clangd::Ref &From) { return Result; } +llvm::Expected +Marshaller::toProtobuf(const clangd::ContainedRefsResult &From) { + ContainedRef Result; + auto Location = toProtobuf(From.Location); + if (!Location) + return Location.takeError(); + *Result.mutable_location() = *Location; + Result.set_kind(static_cast(From.Kind)); + *Result.mutable_symbol() = From.Symbol.str(); + return Result; +} + llvm::Expected Marshaller::toProtobuf(const clangd::SymbolID &Subject, const clangd::Symbol &Object) { Relation Result; diff --git a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h index e827b4c155a20..5bee9205aef58 100644 --- a/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h +++ b/clang-tools-extra/clangd/index/remote/marshalling/Marshalling.h @@ -40,6 +40,8 @@ class Marshaller { llvm::Expected fromProtobuf(const Symbol &Message); llvm::Expected fromProtobuf(const Ref &Message); + llvm::Expected + fromProtobuf(const ContainedRef &Message); llvm::Expected> fromProtobuf(const Relation &Message); @@ -48,6 +50,8 @@ class Marshaller { llvm::Expected fromProtobuf(const FuzzyFindRequest *Message); llvm::Expected fromProtobuf(const RefsRequest *Message); + llvm::Expected + fromProtobuf(const ContainedRefsRequest *Message); llvm::Expected fromProtobuf(const RelationsRequest *Message); @@ -58,10 +62,13 @@ class Marshaller { LookupRequest toProtobuf(const clangd::LookupRequest &From); FuzzyFindRequest toProtobuf(const clangd::FuzzyFindRequest &From); RefsRequest toProtobuf(const clangd::RefsRequest &From); + ContainedRefsRequest toProtobuf(const clangd::ContainedRefsRequest &From); RelationsRequest toProtobuf(const clangd::RelationsRequest &From); llvm::Expected toProtobuf(const clangd::Symbol &From); llvm::Expected toProtobuf(const clangd::Ref &From); + llvm::Expected + toProtobuf(const clangd::ContainedRefsResult &From); llvm::Expected toProtobuf(const clangd::SymbolID &Subject, const clangd::Symbol &Object); diff --git a/clang-tools-extra/clangd/index/remote/server/Server.cpp b/clang-tools-extra/clangd/index/remote/server/Server.cpp index 52fca53260a16..890b6c27ed928 100644 --- a/clang-tools-extra/clangd/index/remote/server/Server.cpp +++ b/clang-tools-extra/clangd/index/remote/server/Server.cpp @@ -258,6 +258,53 @@ class RemoteIndexServer final : public v1::SymbolIndex::Service { return grpc::Status::OK; } + grpc::Status + ContainedRefs(grpc::ServerContext *Context, + const ContainedRefsRequest *Request, + grpc::ServerWriter *Reply) override { + auto StartTime = stopwatch::now(); + WithContextValue WithRequestContext(CurrentRequest, Context); + logRequest(*Request); + trace::Span Tracer("ContainedRefsRequest"); + auto Req = ProtobufMarshaller->fromProtobuf(Request); + if (!Req) { + elog("Can not parse 
ContainedRefsRequest from protobuf: {0}", + Req.takeError()); + return grpc::Status::CANCELLED; + } + if (!Req->Limit || *Req->Limit > LimitResults) { + log("[public] Limiting result size for ContainedRefs request from {0} to " + "{1}.", + Req->Limit, LimitResults); + Req->Limit = LimitResults; + } + unsigned Sent = 0; + unsigned FailedToSend = 0; + bool HasMore = + Index.containedRefs(*Req, [&](const clangd::ContainedRefsResult &Item) { + auto SerializedItem = ProtobufMarshaller->toProtobuf(Item); + if (!SerializedItem) { + elog("Unable to convert ContainedRefsResult to protobuf: {0}", + SerializedItem.takeError()); + ++FailedToSend; + return; + } + ContainedRefsReply NextMessage; + *NextMessage.mutable_stream_result() = *SerializedItem; + logResponse(NextMessage); + Reply->Write(NextMessage); + ++Sent; + }); + ContainedRefsReply LastMessage; + LastMessage.mutable_final_result()->set_has_more(HasMore); + logResponse(LastMessage); + Reply->Write(LastMessage); + SPAN_ATTACH(Tracer, "Sent", Sent); + SPAN_ATTACH(Tracer, "Failed to send", FailedToSend); + logRequestSummary("v1/ContainedRefs", Sent, StartTime); + return grpc::Status::OK; + } + grpc::Status Relations(grpc::ServerContext *Context, const RelationsRequest *Request, grpc::ServerWriter *Reply) override { @@ -396,7 +443,8 @@ void hotReload(clangd::SwapIndex &Index, llvm::StringRef IndexPath, LastStatus.getLastModificationTime(), Status->getLastModificationTime()); LastStatus = *Status; std::unique_ptr NewIndex = - loadIndex(IndexPath, SymbolOrigin::Static); + loadIndex(IndexPath, SymbolOrigin::Static, /*UseDex=*/true, + /*SupportContainedRefs=*/true); if (!NewIndex) { elog("Failed to load new index. Old index will be served."); return; @@ -532,8 +580,9 @@ int main(int argc, char *argv[]) { return Status.getError().value(); } - auto SymIndex = - clang::clangd::loadIndex(IndexPath, clang::clangd::SymbolOrigin::Static); + auto SymIndex = clang::clangd::loadIndex( + IndexPath, clang::clangd::SymbolOrigin::Static, /*UseDex=*/true, + /*SupportContainedRefs=*/true); if (!SymIndex) { llvm::errs() << "Failed to open the index.\n"; return -1; diff --git a/clang-tools-extra/clangd/test/index-serialization/Inputs/sample.idx b/clang-tools-extra/clangd/test/index-serialization/Inputs/sample.idx index 0c04df86ae1c6..6368e7145b1e4 100644 Binary files a/clang-tools-extra/clangd/test/index-serialization/Inputs/sample.idx and b/clang-tools-extra/clangd/test/index-serialization/Inputs/sample.idx differ diff --git a/clang-tools-extra/clangd/test/type-hierarchy-ext.test b/clang-tools-extra/clangd/test/type-hierarchy-ext.test index ddb9a014be0c7..8d1a5dc31da0f 100644 --- a/clang-tools-extra/clangd/test/type-hierarchy-ext.test +++ b/clang-tools-extra/clangd/test/type-hierarchy-ext.test @@ -12,6 +12,7 @@ # CHECK-NEXT: "data": { # CHECK-NEXT: "symbolID": "A6576FE083F2949A" # CHECK-NEXT: }, +# CHECK-NEXT: "detail": "Child3", # CHECK-NEXT: "kind": 23, # CHECK-NEXT: "name": "Child3", # CHECK-NEXT: "range": { @@ -153,6 +154,7 @@ # CHECK-NEXT: "data": { # CHECK-NEXT: "symbolID": "5705B382DFC77CBC" # CHECK-NEXT: }, +# CHECK-NEXT: "detail": "Child4", # CHECK-NEXT: "kind": 23, # CHECK-NEXT: "name": "Child4", # CHECK-NEXT: "range": { diff --git a/clang-tools-extra/clangd/test/type-hierarchy.test b/clang-tools-extra/clangd/test/type-hierarchy.test index 69751000a7c6c..a5f13ab13d0b3 100644 --- a/clang-tools-extra/clangd/test/type-hierarchy.test +++ b/clang-tools-extra/clangd/test/type-hierarchy.test @@ -62,6 +62,7 @@ # CHECK-NEXT: ], # CHECK-NEXT: "symbolID": 
"ECDC0C46D75120F4" # CHECK-NEXT: }, +# CHECK-NEXT: "detail": "Child1", # CHECK-NEXT: "kind": 23, # CHECK-NEXT: "name": "Child1", # CHECK-NEXT: "range": { @@ -112,6 +113,7 @@ # CHECK-NEXT: ], # CHECK-NEXT: "symbolID": "A6576FE083F2949A" # CHECK-NEXT: }, +# CHECK-NEXT: "detail": "Child3", # CHECK-NEXT: "kind": 23, # CHECK-NEXT: "name": "Child3", # CHECK-NEXT: "range": { diff --git a/clang-tools-extra/clangd/tool/Check.cpp b/clang-tools-extra/clangd/tool/Check.cpp index bc2eaa77a66ee..df8d075e80596 100644 --- a/clang-tools-extra/clangd/tool/Check.cpp +++ b/clang-tools-extra/clangd/tool/Check.cpp @@ -163,7 +163,7 @@ class Checker { unsigned ErrCount = 0; Checker(llvm::StringRef File, const ClangdLSPServer::Options &Opts) - : File(File), Opts(Opts) {} + : File(File), Opts(Opts), Index(/*SupportContainedRefs=*/true) {} // Read compilation database and choose a compile command for the file. bool buildCommand(const ThreadsafeFS &TFS) { diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index cc061e2d93231..80a0653f8f740 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -604,7 +604,7 @@ const char TestScheme::TestDir[] = "/clangd-test"; std::unique_ptr loadExternalIndex(const Config::ExternalIndexSpec &External, - AsyncTaskRunner *Tasks) { + AsyncTaskRunner *Tasks, bool SupportContainedRefs) { static const trace::Metric RemoteIndexUsed("used_remote_index", trace::Metric::Value, "address"); switch (External.Kind) { @@ -620,8 +620,9 @@ loadExternalIndex(const Config::ExternalIndexSpec &External, External.Location); auto NewIndex = std::make_unique(std::make_unique()); auto IndexLoadTask = [File = External.Location, - PlaceHolder = NewIndex.get()] { - if (auto Idx = loadIndex(File, SymbolOrigin::Static, /*UseDex=*/true)) + PlaceHolder = NewIndex.get(), SupportContainedRefs] { + if (auto Idx = loadIndex(File, SymbolOrigin::Static, /*UseDex=*/true, + SupportContainedRefs)) PlaceHolder->reset(std::move(Idx)); }; if (Tasks) { @@ -909,7 +910,12 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var Opts.BackgroundIndexPriority = BackgroundIndexPriority; Opts.ReferencesLimit = ReferencesLimit; Opts.Rename.LimitFiles = RenameFileLimit; - auto PAI = createProjectAwareIndex(loadExternalIndex, Sync); + auto PAI = createProjectAwareIndex( + [SupportContainedRefs = Opts.EnableOutgoingCalls]( + const Config::ExternalIndexSpec &External, AsyncTaskRunner *Tasks) { + return loadExternalIndex(External, Tasks, SupportContainedRefs); + }, + Sync); Opts.StaticIndex = PAI.get(); Opts.AsyncThreadsCount = WorkerThreadsCount; Opts.MemoryCleanup = getMemoryCleanupFunction(); diff --git a/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp b/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp index e51942462fbdf..ada14c9939318 100644 --- a/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp +++ b/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp @@ -685,7 +685,8 @@ TEST_F(BackgroundIndexTest, Reindex) { class BackgroundIndexRebuilderTest : public testing::Test { protected: BackgroundIndexRebuilderTest() - : Source(IndexContents::All), Target(std::make_unique()), + : Source(IndexContents::All, /*SupportContainedRefs=*/true), + Target(std::make_unique()), Rebuilder(&Target, &Source, /*Threads=*/10) { // Prepare FileSymbols with TestSymbol in it, for checkRebuild. 
TestSymbol.ID = SymbolID("foo"); diff --git a/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp b/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp index 8821d3aad9c78..316b94305c9ae 100644 --- a/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp +++ b/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp @@ -44,17 +44,27 @@ using ::testing::UnorderedElementsAre; // Helpers for matching call hierarchy data structures. MATCHER_P(withName, N, "") { return arg.name == N; } +MATCHER_P(withDetail, N, "") { return arg.detail == N; } MATCHER_P(withSelectionRange, R, "") { return arg.selectionRange == R; } template ::testing::Matcher from(ItemMatcher M) { return Field(&CallHierarchyIncomingCall::from, M); } +template +::testing::Matcher to(ItemMatcher M) { + return Field(&CallHierarchyOutgoingCall::to, M); +} template -::testing::Matcher fromRanges(RangeMatchers... M) { +::testing::Matcher iFromRanges(RangeMatchers... M) { return Field(&CallHierarchyIncomingCall::fromRanges, UnorderedElementsAre(M...)); } +template +::testing::Matcher oFromRanges(RangeMatchers... M) { + return Field(&CallHierarchyOutgoingCall::fromRanges, + UnorderedElementsAre(M...)); +} TEST(CallHierarchy, IncomingOneFileCpp) { Annotations Source(R"cpp( @@ -79,21 +89,24 @@ TEST(CallHierarchy, IncomingOneFileCpp) { prepareCallHierarchy(AST, Source.point(), testPath(TU.Filename)); ASSERT_THAT(Items, ElementsAre(withName("callee"))); auto IncomingLevel1 = incomingCalls(Items[0], Index.get()); - ASSERT_THAT(IncomingLevel1, - ElementsAre(AllOf(from(withName("caller1")), - fromRanges(Source.range("Callee"))))); + ASSERT_THAT( + IncomingLevel1, + ElementsAre(AllOf(from(AllOf(withName("caller1"), withDetail("caller1"))), + iFromRanges(Source.range("Callee"))))); auto IncomingLevel2 = incomingCalls(IncomingLevel1[0].from, Index.get()); - ASSERT_THAT(IncomingLevel2, - ElementsAre(AllOf(from(withName("caller2")), - fromRanges(Source.range("Caller1A"), - Source.range("Caller1B"))), - AllOf(from(withName("caller3")), - fromRanges(Source.range("Caller1C"))))); + ASSERT_THAT( + IncomingLevel2, + ElementsAre(AllOf(from(AllOf(withName("caller2"), withDetail("caller2"))), + iFromRanges(Source.range("Caller1A"), + Source.range("Caller1B"))), + AllOf(from(AllOf(withName("caller3"), withDetail("caller3"))), + iFromRanges(Source.range("Caller1C"))))); auto IncomingLevel3 = incomingCalls(IncomingLevel2[0].from, Index.get()); - ASSERT_THAT(IncomingLevel3, - ElementsAre(AllOf(from(withName("caller3")), - fromRanges(Source.range("Caller2"))))); + ASSERT_THAT( + IncomingLevel3, + ElementsAre(AllOf(from(AllOf(withName("caller3"), withDetail("caller3"))), + iFromRanges(Source.range("Caller2"))))); auto IncomingLevel4 = incomingCalls(IncomingLevel3[0].from, Index.get()); EXPECT_THAT(IncomingLevel4, IsEmpty()); @@ -125,20 +138,24 @@ TEST(CallHierarchy, IncomingOneFileObjC) { ASSERT_THAT(Items, ElementsAre(withName("callee"))); auto IncomingLevel1 = incomingCalls(Items[0], Index.get()); ASSERT_THAT(IncomingLevel1, - ElementsAre(AllOf(from(withName("caller1")), - fromRanges(Source.range("Callee"))))); + ElementsAre(AllOf(from(AllOf(withName("caller1"), + withDetail("MyClass::caller1"))), + iFromRanges(Source.range("Callee"))))); auto IncomingLevel2 = incomingCalls(IncomingLevel1[0].from, Index.get()); ASSERT_THAT(IncomingLevel2, - ElementsAre(AllOf(from(withName("caller2")), - fromRanges(Source.range("Caller1A"), - Source.range("Caller1B"))), - AllOf(from(withName("caller3")), - fromRanges(Source.range("Caller1C"))))); + 
ElementsAre(AllOf(from(AllOf(withName("caller2"), + withDetail("MyClass::caller2"))), + iFromRanges(Source.range("Caller1A"), + Source.range("Caller1B"))), + AllOf(from(AllOf(withName("caller3"), + withDetail("MyClass::caller3"))), + iFromRanges(Source.range("Caller1C"))))); auto IncomingLevel3 = incomingCalls(IncomingLevel2[0].from, Index.get()); ASSERT_THAT(IncomingLevel3, - ElementsAre(AllOf(from(withName("caller3")), - fromRanges(Source.range("Caller2"))))); + ElementsAre(AllOf(from(AllOf(withName("caller3"), + withDetail("MyClass::caller3"))), + iFromRanges(Source.range("Caller2"))))); auto IncomingLevel4 = incomingCalls(IncomingLevel3[0].from, Index.get()); EXPECT_THAT(IncomingLevel4, IsEmpty()); @@ -167,14 +184,16 @@ TEST(CallHierarchy, MainFileOnlyRef) { prepareCallHierarchy(AST, Source.point(), testPath(TU.Filename)); ASSERT_THAT(Items, ElementsAre(withName("callee"))); auto IncomingLevel1 = incomingCalls(Items[0], Index.get()); - ASSERT_THAT(IncomingLevel1, - ElementsAre(AllOf(from(withName("caller1")), - fromRanges(Source.range("Callee"))))); + ASSERT_THAT( + IncomingLevel1, + ElementsAre(AllOf(from(AllOf(withName("caller1"), withDetail("caller1"))), + iFromRanges(Source.range("Callee"))))); auto IncomingLevel2 = incomingCalls(IncomingLevel1[0].from, Index.get()); - EXPECT_THAT(IncomingLevel2, - ElementsAre(AllOf(from(withName("caller2")), - fromRanges(Source.range("Caller1"))))); + EXPECT_THAT( + IncomingLevel2, + ElementsAre(AllOf(from(AllOf(withName("caller2"), withDetail("caller2"))), + iFromRanges(Source.range("Caller1"))))); } TEST(CallHierarchy, IncomingQualified) { @@ -200,14 +219,72 @@ TEST(CallHierarchy, IncomingQualified) { prepareCallHierarchy(AST, Source.point(), testPath(TU.Filename)); ASSERT_THAT(Items, ElementsAre(withName("Waldo::find"))); auto Incoming = incomingCalls(Items[0], Index.get()); - EXPECT_THAT(Incoming, - ElementsAre(AllOf(from(withName("caller1")), - fromRanges(Source.range("Caller1"))), - AllOf(from(withName("caller2")), - fromRanges(Source.range("Caller2"))))); + EXPECT_THAT( + Incoming, + ElementsAre( + AllOf(from(AllOf(withName("caller1"), withDetail("ns::caller1"))), + iFromRanges(Source.range("Caller1"))), + AllOf(from(AllOf(withName("caller2"), withDetail("ns::caller2"))), + iFromRanges(Source.range("Caller2"))))); } -TEST(CallHierarchy, IncomingMultiFileCpp) { +TEST(CallHierarchy, OutgoingOneFile) { + // Test outgoing call on the main file, with namespaces and methods + Annotations Source(R"cpp( + void callee(int); + namespace ns { + struct Foo { + void caller1(); + }; + void Foo::caller1() { + $Callee[[callee]](42); + } + } + namespace { + void caller2(ns::Foo& F) { + F.$Caller1A[[caller1]](); + F.$Caller1B[[caller1]](); + } + } + void call^er3(ns::Foo& F) { + F.$Caller1C[[caller1]](); + $Caller2[[caller2]](F); + } + )cpp"); + TestTU TU = TestTU::withCode(Source.code()); + auto AST = TU.build(); + auto Index = TU.index(); + + std::vector Items = + prepareCallHierarchy(AST, Source.point(), testPath(TU.Filename)); + ASSERT_THAT(Items, ElementsAre(withName("caller3"))); + auto OugoingLevel1 = outgoingCalls(Items[0], Index.get()); + ASSERT_THAT( + OugoingLevel1, + ElementsAre( + AllOf(to(AllOf(withName("caller1"), withDetail("ns::Foo::caller1"))), + oFromRanges(Source.range("Caller1C"))), + AllOf(to(AllOf(withName("caller2"), withDetail("caller2"))), + oFromRanges(Source.range("Caller2"))))); + + auto OutgoingLevel2 = outgoingCalls(OugoingLevel1[1].to, Index.get()); + ASSERT_THAT( + OutgoingLevel2, + ElementsAre(AllOf( + 
to(AllOf(withName("caller1"), withDetail("ns::Foo::caller1"))), + oFromRanges(Source.range("Caller1A"), Source.range("Caller1B"))))); + + auto OutgoingLevel3 = outgoingCalls(OutgoingLevel2[0].to, Index.get()); + ASSERT_THAT( + OutgoingLevel3, + ElementsAre(AllOf(to(AllOf(withName("callee"), withDetail("callee"))), + oFromRanges(Source.range("Callee"))))); + + auto OutgoingLevel4 = outgoingCalls(OutgoingLevel3[0].to, Index.get()); + EXPECT_THAT(OutgoingLevel4, IsEmpty()); +} + +TEST(CallHierarchy, MultiFileCpp) { // The test uses a .hh suffix for header files to get clang // to parse them in C++ mode. .h files are parsed in C mode // by default, which causes problems because e.g. symbol @@ -221,32 +298,47 @@ TEST(CallHierarchy, IncomingMultiFileCpp) { void calle^e(int) {} )cpp"); Annotations Caller1H(R"cpp( - void caller1(); + namespace nsa { + void caller1(); + } )cpp"); Annotations Caller1C(R"cpp( #include "callee.hh" #include "caller1.hh" - void caller1() { - [[calle^e]](42); + namespace nsa { + void caller1() { + [[calle^e]](42); + } } )cpp"); Annotations Caller2H(R"cpp( - void caller2(); + namespace nsb { + void caller2(); + } )cpp"); Annotations Caller2C(R"cpp( #include "caller1.hh" #include "caller2.hh" - void caller2() { - $A[[caller1]](); - $B[[caller1]](); + namespace nsb { + void caller2() { + nsa::$A[[caller1]](); + nsa::$B[[caller1]](); + } + } + )cpp"); + Annotations Caller3H(R"cpp( + namespace nsa { + void call^er3(); } )cpp"); Annotations Caller3C(R"cpp( #include "caller1.hh" #include "caller2.hh" - void caller3() { - $Caller1[[caller1]](); - $Caller2[[caller2]](); + namespace nsa { + void call^er3() { + $Caller1[[caller1]](); + nsb::$Caller2[[caller2]](); + } } )cpp"); @@ -254,6 +346,7 @@ TEST(CallHierarchy, IncomingMultiFileCpp) { Workspace.addSource("callee.hh", CalleeH.code()); Workspace.addSource("caller1.hh", Caller1H.code()); Workspace.addSource("caller2.hh", Caller2H.code()); + Workspace.addSource("caller3.hh", Caller3H.code()); Workspace.addMainFile("callee.cc", CalleeC.code()); Workspace.addMainFile("caller1.cc", Caller1C.code()); Workspace.addMainFile("caller2.cc", Caller2C.code()); @@ -261,46 +354,84 @@ TEST(CallHierarchy, IncomingMultiFileCpp) { auto Index = Workspace.index(); - auto CheckCallHierarchy = [&](ParsedAST &AST, Position Pos, PathRef TUPath) { + auto CheckIncomingCalls = [&](ParsedAST &AST, Position Pos, PathRef TUPath) { std::vector Items = prepareCallHierarchy(AST, Pos, TUPath); ASSERT_THAT(Items, ElementsAre(withName("callee"))); auto IncomingLevel1 = incomingCalls(Items[0], Index.get()); ASSERT_THAT(IncomingLevel1, - ElementsAre(AllOf(from(withName("caller1")), - fromRanges(Caller1C.range())))); + ElementsAre(AllOf(from(AllOf(withName("caller1"), + withDetail("nsa::caller1"))), + iFromRanges(Caller1C.range())))); auto IncomingLevel2 = incomingCalls(IncomingLevel1[0].from, Index.get()); ASSERT_THAT( IncomingLevel2, - ElementsAre(AllOf(from(withName("caller2")), - fromRanges(Caller2C.range("A"), Caller2C.range("B"))), - AllOf(from(withName("caller3")), - fromRanges(Caller3C.range("Caller1"))))); + ElementsAre( + AllOf(from(AllOf(withName("caller2"), withDetail("nsb::caller2"))), + iFromRanges(Caller2C.range("A"), Caller2C.range("B"))), + AllOf(from(AllOf(withName("caller3"), withDetail("nsa::caller3"))), + iFromRanges(Caller3C.range("Caller1"))))); auto IncomingLevel3 = incomingCalls(IncomingLevel2[0].from, Index.get()); ASSERT_THAT(IncomingLevel3, - ElementsAre(AllOf(from(withName("caller3")), - fromRanges(Caller3C.range("Caller2"))))); + 
ElementsAre(AllOf(from(AllOf(withName("caller3"), + withDetail("nsa::caller3"))), + iFromRanges(Caller3C.range("Caller2"))))); auto IncomingLevel4 = incomingCalls(IncomingLevel3[0].from, Index.get()); EXPECT_THAT(IncomingLevel4, IsEmpty()); }; + auto CheckOutgoingCalls = [&](ParsedAST &AST, Position Pos, PathRef TUPath) { + std::vector Items = + prepareCallHierarchy(AST, Pos, TUPath); + ASSERT_THAT(Items, ElementsAre(withName("caller3"))); + auto OutgoingLevel1 = outgoingCalls(Items[0], Index.get()); + ASSERT_THAT( + OutgoingLevel1, + ElementsAre( + AllOf(to(AllOf(withName("caller1"), withDetail("nsa::caller1"))), + oFromRanges(Caller3C.range("Caller1"))), + AllOf(to(AllOf(withName("caller2"), withDetail("nsb::caller2"))), + oFromRanges(Caller3C.range("Caller2"))))); + + auto OutgoingLevel2 = outgoingCalls(OutgoingLevel1[1].to, Index.get()); + ASSERT_THAT(OutgoingLevel2, + ElementsAre(AllOf( + to(AllOf(withName("caller1"), withDetail("nsa::caller1"))), + oFromRanges(Caller2C.range("A"), Caller2C.range("B"))))); + + auto OutgoingLevel3 = outgoingCalls(OutgoingLevel2[0].to, Index.get()); + ASSERT_THAT( + OutgoingLevel3, + ElementsAre(AllOf(to(AllOf(withName("callee"), withDetail("callee"))), + oFromRanges(Caller1C.range())))); + + auto OutgoingLevel4 = outgoingCalls(OutgoingLevel3[0].to, Index.get()); + EXPECT_THAT(OutgoingLevel4, IsEmpty()); + }; + // Check that invoking from a call site works. auto AST = Workspace.openFile("caller1.cc"); ASSERT_TRUE(bool(AST)); - CheckCallHierarchy(*AST, Caller1C.point(), testPath("caller1.cc")); + CheckIncomingCalls(*AST, Caller1C.point(), testPath("caller1.cc")); // Check that invoking from the declaration site works. AST = Workspace.openFile("callee.hh"); ASSERT_TRUE(bool(AST)); - CheckCallHierarchy(*AST, CalleeH.point(), testPath("callee.hh")); + CheckIncomingCalls(*AST, CalleeH.point(), testPath("callee.hh")); + AST = Workspace.openFile("caller3.hh"); + ASSERT_TRUE(bool(AST)); + CheckOutgoingCalls(*AST, Caller3H.point(), testPath("caller3.hh")); // Check that invoking from the definition site works. 
AST = Workspace.openFile("callee.cc"); ASSERT_TRUE(bool(AST)); - CheckCallHierarchy(*AST, CalleeC.point(), testPath("callee.cc")); + CheckIncomingCalls(*AST, CalleeC.point(), testPath("callee.cc")); + AST = Workspace.openFile("caller3.cc"); + ASSERT_TRUE(bool(AST)); + CheckOutgoingCalls(*AST, Caller3C.point(), testPath("caller3.cc")); } TEST(CallHierarchy, IncomingMultiFileObjC) { @@ -377,20 +508,20 @@ TEST(CallHierarchy, IncomingMultiFileObjC) { auto IncomingLevel1 = incomingCalls(Items[0], Index.get()); ASSERT_THAT(IncomingLevel1, ElementsAre(AllOf(from(withName("caller1")), - fromRanges(Caller1C.range())))); + iFromRanges(Caller1C.range())))); auto IncomingLevel2 = incomingCalls(IncomingLevel1[0].from, Index.get()); - ASSERT_THAT( - IncomingLevel2, - ElementsAre(AllOf(from(withName("caller2")), - fromRanges(Caller2C.range("A"), Caller2C.range("B"))), - AllOf(from(withName("caller3")), - fromRanges(Caller3C.range("Caller1"))))); + ASSERT_THAT(IncomingLevel2, + ElementsAre(AllOf(from(withName("caller2")), + iFromRanges(Caller2C.range("A"), + Caller2C.range("B"))), + AllOf(from(withName("caller3")), + iFromRanges(Caller3C.range("Caller1"))))); auto IncomingLevel3 = incomingCalls(IncomingLevel2[0].from, Index.get()); ASSERT_THAT(IncomingLevel3, ElementsAre(AllOf(from(withName("caller3")), - fromRanges(Caller3C.range("Caller2"))))); + iFromRanges(Caller3C.range("Caller2"))))); auto IncomingLevel4 = incomingCalls(IncomingLevel3[0].from, Index.get()); EXPECT_THAT(IncomingLevel4, IsEmpty()); @@ -438,12 +569,12 @@ TEST(CallHierarchy, CallInLocalVarDecl) { ASSERT_THAT(Items, ElementsAre(withName("callee"))); auto Incoming = incomingCalls(Items[0], Index.get()); - ASSERT_THAT( - Incoming, - ElementsAre( - AllOf(from(withName("caller1")), fromRanges(Source.range("call1"))), - AllOf(from(withName("caller2")), fromRanges(Source.range("call2"))), - AllOf(from(withName("caller3")), fromRanges(Source.range("call3"))))); + ASSERT_THAT(Incoming, ElementsAre(AllOf(from(withName("caller1")), + iFromRanges(Source.range("call1"))), + AllOf(from(withName("caller2")), + iFromRanges(Source.range("call2"))), + AllOf(from(withName("caller3")), + iFromRanges(Source.range("call3"))))); } TEST(CallHierarchy, HierarchyOnField) { @@ -467,7 +598,7 @@ TEST(CallHierarchy, HierarchyOnField) { auto IncomingLevel1 = incomingCalls(Items[0], Index.get()); ASSERT_THAT(IncomingLevel1, ElementsAre(AllOf(from(withName("caller")), - fromRanges(Source.range("Callee"))))); + iFromRanges(Source.range("Callee"))))); } TEST(CallHierarchy, HierarchyOnVar) { @@ -488,7 +619,7 @@ TEST(CallHierarchy, HierarchyOnVar) { auto IncomingLevel1 = incomingCalls(Items[0], Index.get()); ASSERT_THAT(IncomingLevel1, ElementsAre(AllOf(from(withName("caller")), - fromRanges(Source.range("Callee"))))); + iFromRanges(Source.range("Callee"))))); } TEST(CallHierarchy, CallInDifferentFileThanCaller) { @@ -517,7 +648,7 @@ TEST(CallHierarchy, CallInDifferentFileThanCaller) { // header. The protocol does not allow us to represent such calls, so we drop // them. (The call hierarchy item itself is kept.) 
EXPECT_THAT(Incoming, - ElementsAre(AllOf(from(withName("caller")), fromRanges()))); + ElementsAre(AllOf(from(withName("caller")), iFromRanges()))); } } // namespace diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index a89f499736226..3acacf496e77f 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -1703,6 +1703,12 @@ class IndexRequestCollector : public SymbolIndex { return false; } + bool containedRefs( + const ContainedRefsRequest &, + llvm::function_ref) const override { + return false; + } + void relations(const RelationsRequest &, llvm::function_ref) const override {} diff --git a/clang-tools-extra/clangd/unittests/DexTests.cpp b/clang-tools-extra/clangd/unittests/DexTests.cpp index cafbfd324840c..ca8b81b5cb3c0 100644 --- a/clang-tools-extra/clangd/unittests/DexTests.cpp +++ b/clang-tools-extra/clangd/unittests/DexTests.cpp @@ -476,7 +476,7 @@ TEST(DexSearchTokens, SymbolPath) { TEST(Dex, Lookup) { auto I = Dex::build(generateSymbols({"ns::abc", "ns::xyz"}), RefSlab(), - RelationSlab()); + RelationSlab(), true); EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), UnorderedElementsAre("ns::abc", "ns::xyz")); @@ -489,7 +489,7 @@ TEST(Dex, FuzzyFind) { auto Index = Dex::build(generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC", "other::ABC", "other::A"}), - RefSlab(), RelationSlab()); + RefSlab(), RelationSlab(), true); FuzzyFindRequest Req; Req.Query = "ABC"; Req.Scopes = {"ns::"}; @@ -511,7 +511,8 @@ TEST(Dex, FuzzyFind) { } TEST(DexTest, DexLimitedNumMatches) { - auto I = Dex::build(generateNumSymbols(0, 100), RefSlab(), RelationSlab()); + auto I = + Dex::build(generateNumSymbols(0, 100), RefSlab(), RelationSlab(), true); FuzzyFindRequest Req; Req.Query = "5"; Req.AnyScope = true; @@ -526,7 +527,7 @@ TEST(DexTest, DexLimitedNumMatches) { TEST(DexTest, FuzzyMatch) { auto I = Dex::build( generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}), - RefSlab(), RelationSlab()); + RefSlab(), RelationSlab(), true); FuzzyFindRequest Req; Req.Query = "lol"; Req.AnyScope = true; @@ -537,7 +538,7 @@ TEST(DexTest, FuzzyMatch) { TEST(DexTest, ShortQuery) { auto I = Dex::build(generateSymbols({"_OneTwoFourSix"}), RefSlab(), - RelationSlab()); + RelationSlab(), true); FuzzyFindRequest Req; Req.AnyScope = true; bool Incomplete; @@ -580,7 +581,7 @@ TEST(DexTest, ShortQuery) { TEST(DexTest, MatchQualifiedNamesWithoutSpecificScope) { auto I = Dex::build(generateSymbols({"a::y1", "b::y2", "y3"}), RefSlab(), - RelationSlab()); + RelationSlab(), true); FuzzyFindRequest Req; Req.AnyScope = true; Req.Query = "y"; @@ -589,7 +590,7 @@ TEST(DexTest, MatchQualifiedNamesWithoutSpecificScope) { TEST(DexTest, MatchQualifiedNamesWithGlobalScope) { auto I = Dex::build(generateSymbols({"a::y1", "b::y2", "y3"}), RefSlab(), - RelationSlab()); + RelationSlab(), true); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {""}; @@ -599,7 +600,7 @@ TEST(DexTest, MatchQualifiedNamesWithGlobalScope) { TEST(DexTest, MatchQualifiedNamesWithOneScope) { auto I = Dex::build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}), - RefSlab(), RelationSlab()); + RefSlab(), RelationSlab(), true); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a::"}; @@ -609,7 +610,7 @@ TEST(DexTest, MatchQualifiedNamesWithOneScope) { TEST(DexTest, 
MatchQualifiedNamesWithMultipleScopes) { auto I = Dex::build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}), - RefSlab(), RelationSlab()); + RefSlab(), RelationSlab(), true); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a::", "b::"}; @@ -618,7 +619,7 @@ TEST(DexTest, MatchQualifiedNamesWithMultipleScopes) { TEST(DexTest, NoMatchNestedScopes) { auto I = Dex::build(generateSymbols({"a::y1", "a::b::y2"}), RefSlab(), - RelationSlab()); + RelationSlab(), true); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a::"}; @@ -627,7 +628,7 @@ TEST(DexTest, NoMatchNestedScopes) { TEST(DexTest, WildcardScope) { auto I = Dex::build(generateSymbols({"a::y1", "a::b::y2", "c::y3"}), - RefSlab(), RelationSlab()); + RefSlab(), RelationSlab(), true); FuzzyFindRequest Req; Req.AnyScope = true; Req.Query = "y"; @@ -638,7 +639,7 @@ TEST(DexTest, WildcardScope) { TEST(DexTest, IgnoreCases) { auto I = Dex::build(generateSymbols({"ns::ABC", "ns::abc"}), RefSlab(), - RelationSlab()); + RelationSlab(), true); FuzzyFindRequest Req; Req.Query = "AB"; Req.Scopes = {"ns::"}; @@ -648,7 +649,7 @@ TEST(DexTest, IgnoreCases) { TEST(DexTest, UnknownPostingList) { // Regression test: we used to ignore unknown scopes and accept any symbol. auto I = Dex::build(generateSymbols({"ns::ABC", "ns::abc"}), RefSlab(), - RelationSlab()); + RelationSlab(), true); FuzzyFindRequest Req; Req.Scopes = {"ns2::"}; EXPECT_THAT(match(*I, Req), UnorderedElementsAre()); @@ -656,7 +657,7 @@ TEST(DexTest, UnknownPostingList) { TEST(DexTest, Lookup) { auto I = Dex::build(generateSymbols({"ns::abc", "ns::xyz"}), RefSlab(), - RelationSlab()); + RelationSlab(), true); EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), UnorderedElementsAre("ns::abc", "ns::xyz")); @@ -671,7 +672,7 @@ TEST(DexTest, SymbolIndexOptionsFilter) { CodeCompletionSymbol.Flags = Symbol::SymbolFlag::IndexedForCodeCompletion; NonCodeCompletionSymbol.Flags = Symbol::SymbolFlag::None; std::vector Symbols{CodeCompletionSymbol, NonCodeCompletionSymbol}; - Dex I(Symbols, RefSlab(), RelationSlab()); + Dex I(Symbols, RefSlab(), RelationSlab(), true); FuzzyFindRequest Req; Req.AnyScope = true; Req.RestrictForCodeCompletion = false; @@ -687,7 +688,7 @@ TEST(DexTest, ProximityPathsBoosting) { CloseSymbol.CanonicalDeclaration.FileURI = "unittest:///a/b/c/d/e/f/file.h"; std::vector Symbols{CloseSymbol, RootSymbol}; - Dex I(Symbols, RefSlab(), RelationSlab()); + Dex I(Symbols, RefSlab(), RelationSlab(), true); FuzzyFindRequest Req; Req.AnyScope = true; @@ -726,7 +727,7 @@ TEST(DexTests, Refs) { Req.Filter = RefKind::Declaration | RefKind::Definition; std::vector Files; - EXPECT_FALSE(Dex(std::vector{Foo, Bar}, Refs, RelationSlab()) + EXPECT_FALSE(Dex(std::vector{Foo, Bar}, Refs, RelationSlab(), true) .refs(Req, [&](const Ref &R) { Files.push_back(R.Location.FileURI); })); @@ -734,7 +735,7 @@ TEST(DexTests, Refs) { Req.Limit = 1; Files.clear(); - EXPECT_TRUE(Dex(std::vector{Foo, Bar}, Refs, RelationSlab()) + EXPECT_TRUE(Dex(std::vector{Foo, Bar}, Refs, RelationSlab(), true) .refs(Req, [&](const Ref &R) { Files.push_back(R.Location.FileURI); })); @@ -751,7 +752,7 @@ TEST(DexTests, Relations) { std::vector Relations{{Parent.ID, RelationKind::BaseOf, Child1.ID}, {Parent.ID, RelationKind::BaseOf, Child2.ID}}; - Dex I{Symbols, RefSlab(), Relations}; + Dex I{Symbols, RefSlab(), Relations, true}; std::vector Results; RelationsRequest Req; @@ -770,7 +771,7 @@ TEST(DexIndex, 
IndexedFiles) { auto Data = std::make_pair(std::move(Symbols), std::move(Refs)); llvm::StringSet<> Files = {"unittest:///foo.cc", "unittest:///bar.cc"}; Dex I(std::move(Data.first), std::move(Data.second), RelationSlab(), - std::move(Files), IndexContents::All, std::move(Data), Size); + std::move(Files), IndexContents::All, std::move(Data), Size, true); auto ContainsFile = I.indexedFiles(); EXPECT_EQ(ContainsFile("unittest:///foo.cc"), IndexContents::All); EXPECT_EQ(ContainsFile("unittest:///bar.cc"), IndexContents::All); @@ -784,7 +785,7 @@ TEST(DexTest, PreferredTypesBoosting) { Sym2.Type = "T2"; std::vector Symbols{Sym1, Sym2}; - Dex I(Symbols, RefSlab(), RelationSlab()); + Dex I(Symbols, RefSlab(), RelationSlab(), true); FuzzyFindRequest Req; Req.AnyScope = true; @@ -820,7 +821,8 @@ TEST(DexTest, TemplateSpecialization) { index::SymbolProperty::TemplatePartialSpecialization); B.insert(S); - auto I = dex::Dex::build(std::move(B).build(), RefSlab(), RelationSlab()); + auto I = + dex::Dex::build(std::move(B).build(), RefSlab(), RelationSlab(), true); FuzzyFindRequest Req; Req.AnyScope = true; diff --git a/clang-tools-extra/clangd/unittests/FileIndexTests.cpp b/clang-tools-extra/clangd/unittests/FileIndexTests.cpp index 9f713564b2c01..a92142fbcd7c4 100644 --- a/clang-tools-extra/clangd/unittests/FileIndexTests.cpp +++ b/clang-tools-extra/clangd/unittests/FileIndexTests.cpp @@ -104,7 +104,7 @@ std::unique_ptr relSlab(llvm::ArrayRef Rels) { } TEST(FileSymbolsTest, UpdateAndGet) { - FileSymbols FS(IndexContents::All); + FileSymbols FS(IndexContents::All, true); EXPECT_THAT(runFuzzyFind(*FS.buildIndex(IndexType::Light), ""), IsEmpty()); FS.update("f1", numSlab(1, 3), refSlab(SymbolID("1"), "f1.cc"), nullptr, @@ -116,7 +116,7 @@ TEST(FileSymbolsTest, UpdateAndGet) { } TEST(FileSymbolsTest, Overlap) { - FileSymbols FS(IndexContents::All); + FileSymbols FS(IndexContents::All, true); FS.update("f1", numSlab(1, 3), nullptr, nullptr, false); FS.update("f2", numSlab(3, 5), nullptr, nullptr, false); for (auto Type : {IndexType::Light, IndexType::Heavy}) @@ -126,7 +126,7 @@ TEST(FileSymbolsTest, Overlap) { } TEST(FileSymbolsTest, MergeOverlap) { - FileSymbols FS(IndexContents::All); + FileSymbols FS(IndexContents::All, true); auto OneSymboSlab = [](Symbol Sym) { SymbolSlab::Builder S; S.insert(Sym); @@ -147,7 +147,7 @@ TEST(FileSymbolsTest, MergeOverlap) { } TEST(FileSymbolsTest, SnapshotAliveAfterRemove) { - FileSymbols FS(IndexContents::All); + FileSymbols FS(IndexContents::All, true); SymbolID ID("1"); FS.update("f1", numSlab(1, 3), refSlab(ID, "f1.cc"), nullptr, false); @@ -180,14 +180,14 @@ void update(FileIndex &M, llvm::StringRef Basename, llvm::StringRef Code) { } TEST(FileIndexTest, CustomizedURIScheme) { - FileIndex M; + FileIndex M(true); update(M, "f", "class string {};"); EXPECT_THAT(runFuzzyFind(M, ""), ElementsAre(declURI("unittest:///f.h"))); } TEST(FileIndexTest, IndexAST) { - FileIndex M; + FileIndex M(true); update(M, "f1", "namespace ns { void f() {} class X {}; }"); FuzzyFindRequest Req; @@ -198,7 +198,7 @@ TEST(FileIndexTest, IndexAST) { } TEST(FileIndexTest, NoLocal) { - FileIndex M; + FileIndex M(true); update(M, "f1", "namespace ns { void f() { int local = 0; } class X {}; }"); EXPECT_THAT( @@ -207,7 +207,7 @@ TEST(FileIndexTest, NoLocal) { } TEST(FileIndexTest, IndexMultiASTAndDeduplicate) { - FileIndex M; + FileIndex M(true); update(M, "f1", "namespace ns { void f() {} class X {}; }"); update(M, "f2", "namespace ns { void ff() {} class X {}; }"); @@ -219,7 +219,7 @@ 
TEST(FileIndexTest, IndexMultiASTAndDeduplicate) { } TEST(FileIndexTest, ClassMembers) { - FileIndex M; + FileIndex M(true); update(M, "f1", "class X { static int m1; int m2; static void f(); };"); EXPECT_THAT(runFuzzyFind(M, ""), @@ -228,7 +228,7 @@ TEST(FileIndexTest, ClassMembers) { } TEST(FileIndexTest, IncludeCollected) { - FileIndex M; + FileIndex M(true); update( M, "f", "// IWYU pragma: private, include \nclass string {};"); @@ -240,7 +240,7 @@ TEST(FileIndexTest, IncludeCollected) { } TEST(FileIndexTest, IWYUPragmaExport) { - FileIndex M; + FileIndex M(true); TestTU File; File.Code = R"cpp(#pragma once @@ -286,7 +286,7 @@ template vector make_vector(Arg A) {} )cpp"; - FileIndex M; + FileIndex M(true); update(M, "f", Source); auto Symbols = runFuzzyFind(M, ""); @@ -334,7 +334,7 @@ TEST(FileIndexTest, RebuildWithPreamble) { IgnoreDiagnostics IgnoreDiags; auto CI = buildCompilerInvocation(PI, IgnoreDiags); - FileIndex Index; + FileIndex Index(true); bool IndexUpdated = false; buildPreamble( FooCpp, *CI, PI, @@ -374,7 +374,7 @@ TEST(FileIndexTest, Refs) { RefsRequest Request; Request.IDs = {Foo.ID}; - FileIndex Index; + FileIndex Index(true); // Add test.cc TestTU Test; Test.HeaderCode = HeaderCode; @@ -409,7 +409,7 @@ TEST(FileIndexTest, MacroRefs) { } )cpp"); - FileIndex Index; + FileIndex Index(true); // Add test.cc TestTU Test; Test.HeaderCode = std::string(HeaderCode.code()); @@ -432,7 +432,7 @@ TEST(FileIndexTest, MacroRefs) { } TEST(FileIndexTest, CollectMacros) { - FileIndex M; + FileIndex M(true); update(M, "f", "#define CLANGD 1"); EXPECT_THAT(runFuzzyFind(M, ""), Contains(qName("CLANGD"))); } @@ -443,7 +443,7 @@ TEST(FileIndexTest, Relations) { TU.HeaderFilename = "f.h"; TU.HeaderCode = "class A {}; class B : public A {};"; auto AST = TU.build(); - FileIndex Index; + FileIndex Index(true); Index.updatePreamble(testPath(TU.Filename), /*Version=*/"null", AST.getASTContext(), AST.getPreprocessor(), AST.getPragmaIncludes()); @@ -493,7 +493,7 @@ TEST(FileIndexTest, ReferencesInMainFileWithPreamble) { )cpp"); TU.Code = std::string(Main.code()); auto AST = TU.build(); - FileIndex Index; + FileIndex Index(true); Index.updateMain(testPath(TU.Filename), AST); // Expect to see references in main file, references in headers are excluded @@ -510,7 +510,7 @@ TEST(FileIndexTest, MergeMainFileSymbols) { Cpp.HeaderFilename = "foo.h"; Cpp.HeaderCode = CommonHeader; - FileIndex Index; + FileIndex Index(true); auto HeaderAST = Header.build(); auto CppAST = Cpp.build(); Index.updateMain(testPath("foo.h"), HeaderAST); @@ -524,7 +524,7 @@ TEST(FileIndexTest, MergeMainFileSymbols) { } TEST(FileSymbolsTest, CountReferencesNoRefSlabs) { - FileSymbols FS(IndexContents::All); + FileSymbols FS(IndexContents::All, true); FS.update("f1", numSlab(1, 3), nullptr, nullptr, true); FS.update("f2", numSlab(1, 3), nullptr, nullptr, false); EXPECT_THAT( @@ -536,7 +536,7 @@ TEST(FileSymbolsTest, CountReferencesNoRefSlabs) { } TEST(FileSymbolsTest, CountReferencesWithRefSlabs) { - FileSymbols FS(IndexContents::All); + FileSymbols FS(IndexContents::All, true); FS.update("f1cpp", numSlab(1, 3), refSlab(SymbolID("1"), "f1.cpp"), nullptr, true); FS.update("f1h", numSlab(1, 3), refSlab(SymbolID("1"), "f1.h"), nullptr, @@ -558,7 +558,7 @@ TEST(FileSymbolsTest, CountReferencesWithRefSlabs) { } TEST(FileIndexTest, StalePreambleSymbolsDeleted) { - FileIndex M; + FileIndex M(true); TestTU File; File.HeaderFilename = "a.h"; @@ -581,7 +581,7 @@ TEST(FileIndexTest, StalePreambleSymbolsDeleted) { // Verifies that concurrent 
calls to updateMain don't "lose" any updates. TEST(FileIndexTest, Threadsafety) { - FileIndex M; + FileIndex M(true); Notification Go; constexpr int Count = 10; @@ -714,7 +714,7 @@ TEST(FileShardedIndexTest, Sharding) { } TEST(FileIndexTest, Profile) { - FileIndex FI; + FileIndex FI(true); auto FileName = testPath("foo.cpp"); auto AST = TestTU::withHeaderCode("int a;").build(); @@ -738,7 +738,7 @@ TEST(FileIndexTest, Profile) { } TEST(FileSymbolsTest, Profile) { - FileSymbols FS(IndexContents::All); + FileSymbols FS(IndexContents::All, true); FS.update("f1", numSlab(1, 2), nullptr, nullptr, false); FS.update("f2", nullptr, refSlab(SymbolID("1"), "f1"), nullptr, false); FS.update("f3", nullptr, nullptr, @@ -758,7 +758,7 @@ TEST(FileSymbolsTest, Profile) { } TEST(FileIndexTest, MacrosFromMainFile) { - FileIndex Idx; + FileIndex Idx(true); TestTU TU; TU.Code = "#pragma once\n#define FOO"; TU.Filename = "foo.h"; diff --git a/clang-tools-extra/clangd/unittests/IndexTests.cpp b/clang-tools-extra/clangd/unittests/IndexTests.cpp index 658b4e200004e..a66680d39c87d 100644 --- a/clang-tools-extra/clangd/unittests/IndexTests.cpp +++ b/clang-tools-extra/clangd/unittests/IndexTests.cpp @@ -292,7 +292,7 @@ TEST(MergeIndexTest, Lookup) { } TEST(MergeIndexTest, LookupRemovedDefinition) { - FileIndex DynamicIndex, StaticIndex; + FileIndex DynamicIndex(true), StaticIndex(true); MergedIndex Merge(&DynamicIndex, &StaticIndex); const char *HeaderCode = "class Foo;"; @@ -349,7 +349,7 @@ TEST(MergeIndexTest, FuzzyFind) { } TEST(MergeIndexTest, FuzzyFindRemovedSymbol) { - FileIndex DynamicIndex, StaticIndex; + FileIndex DynamicIndex(true), StaticIndex(true); MergedIndex Merge(&DynamicIndex, &StaticIndex); const char *HeaderCode = "class Foo;"; @@ -446,8 +446,8 @@ TEST(MergeTest, PreferSymbolLocationInCodegenFile) { } TEST(MergeIndexTest, Refs) { - FileIndex Dyn; - FileIndex StaticIndex; + FileIndex Dyn(true); + FileIndex StaticIndex(true); MergedIndex Merge(&Dyn, &StaticIndex); const char *HeaderCode = "class Foo;"; diff --git a/clang-tools-extra/clangd/unittests/RenameTests.cpp b/clang-tools-extra/clangd/unittests/RenameTests.cpp index 7d9252110b27d..142ed171d1a1c 100644 --- a/clang-tools-extra/clangd/unittests/RenameTests.cpp +++ b/clang-tools-extra/clangd/unittests/RenameTests.cpp @@ -1548,7 +1548,7 @@ TEST(CrossFileRenameTests, DirtyBuffer) { std::string BarPath = testPath("bar.cc"); // Build the index, the index has "Foo" references from foo.cc and "Bar" // references from bar.cc. 
-  FileSymbols FSymbols(IndexContents::All);
+  FileSymbols FSymbols(IndexContents::All, true);
   FSymbols.update(FooPath, nullptr, buildRefSlab(FooCode, "Foo", FooPath),
                   nullptr, false);
   FSymbols.update(BarPath, nullptr, buildRefSlab(BarCode, "Bar", BarPath),
@@ -1601,6 +1601,12 @@ TEST(CrossFileRenameTests, DirtyBuffer) {
       return true; // has more references
     }
+    bool containedRefs(const ContainedRefsRequest &Req,
+                       llvm::function_ref<void(const ContainedRefsResult &)>
+                           Callback) const override {
+      return false;
+    }
+
     bool fuzzyFind(
         const FuzzyFindRequest &Req,
         llvm::function_ref<void(const Symbol &)> Callback) const override {
@@ -1652,6 +1658,12 @@ TEST(CrossFileRenameTests, DeduplicateRefsFromIndex) {
       return false;
     }
+    bool containedRefs(const ContainedRefsRequest &Req,
+                       llvm::function_ref<void(const ContainedRefsResult &)>
+                           Callback) const override {
+      return false;
+    }
+
     bool fuzzyFind(const FuzzyFindRequest &,
                    llvm::function_ref<void(const Symbol &)>) const override {
       return false;
diff --git a/clang-tools-extra/clangd/unittests/TestTU.cpp b/clang-tools-extra/clangd/unittests/TestTU.cpp
index 1f02c04125b1e..3f8990c86f714 100644
--- a/clang-tools-extra/clangd/unittests/TestTU.cpp
+++ b/clang-tools-extra/clangd/unittests/TestTU.cpp
@@ -174,7 +174,7 @@ RefSlab TestTU::headerRefs() const {
 std::unique_ptr<SymbolIndex> TestTU::index() const {
   auto AST = build();
-  auto Idx = std::make_unique<FileIndex>();
+  auto Idx = std::make_unique<FileIndex>(/*SupportContainedRefs=*/true);
   Idx->updatePreamble(testPath(Filename), /*Version=*/"null",
                       AST.getASTContext(), AST.getPreprocessor(),
                       AST.getPragmaIncludes());
diff --git a/clang-tools-extra/clangd/unittests/TestWorkspace.cpp b/clang-tools-extra/clangd/unittests/TestWorkspace.cpp
index 2130e7a4c6dd4..e9a50f1e8b63a 100644
--- a/clang-tools-extra/clangd/unittests/TestWorkspace.cpp
+++ b/clang-tools-extra/clangd/unittests/TestWorkspace.cpp
@@ -17,7 +17,7 @@ namespace clang {
 namespace clangd {
 std::unique_ptr<SymbolIndex> TestWorkspace::index() {
-  auto Index = std::make_unique<FileIndex>();
+  auto Index = std::make_unique<FileIndex>(/*SupportContainedRefs=*/true);
   for (const auto &Input : Inputs) {
     if (!Input.second.IsMainFile)
       continue;
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 453a91e3b504c..e00f86f7d0144 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -184,7 +184,8 @@ Changes in existing checks
   ` check to diagnose potential dangling references
   when returning a ``const &`` parameter by using the conditional operator
   ``cond ? var1 : var2`` and no longer giving
-  false positives for functions which contain lambda.
+  false positives for functions which contain lambdas, and ignoring parameters
+  with the ``[[clang::lifetimebound]]`` attribute.
 
 - Improved :doc:`bugprone-sizeof-expression
   ` check to find suspicious
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst
index 2349e51477b7d..ba47399914de3 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst
@@ -12,15 +12,6 @@ after the call. When the function returns such a parameter also as constant refe
 then the returned reference can be used after the object it refers to has been
 destroyed.
 
-This issue can be resolved by declaring an overload of the problematic function
-where the ``const &`` parameter is instead declared as ``&&``. The developer has
-to ensure that the implementation of that function does not produce a
-use-after-free, the exact error that this check is warning against.
-Marking such an ``&&`` overload as ``deleted``, will silence the warning as
-well. In the case of different ``const &`` parameters being returned depending
-on the control flow of the function, an overload where all problematic
-``const &`` parameters have been declared as ``&&`` will resolve the issue.
-
 Example
 -------
 
@@ -38,3 +29,23 @@ Example
 
     const S& s = fn(S{1});
     s.v; // use after free
+
+
+This issue can be resolved by declaring an overload of the problematic function
+where the ``const &`` parameter is instead declared as ``&&``. The developer has
+to ensure that the implementation of that function does not produce a
+use-after-free, the exact error that this check is warning against.
+Marking such an ``&&`` overload as ``deleted`` will silence the warning as
+well. In the case of different ``const &`` parameters being returned depending
+on the control flow of the function, an overload where all problematic
+``const &`` parameters have been declared as ``&&`` will resolve the issue.
+
+This issue can also be resolved by adding ``[[clang::lifetimebound]]``. Clang
+enables the ``-Wdangling`` warning by default, which can detect misuses of the
+annotated function. See `lifetimebound attribute `_
+for details.
+
+.. code-block:: c++
+
+   const int &f(const int &a [[clang::lifetimebound]]) { return a; } // no warning
+   const int &v = f(1); // warning: temporary bound to local reference 'v' will be destroyed at the end of the full-expression [-Wdangling]
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp
index 49aeb50155b15..46cb9063beda9 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp
@@ -197,3 +197,9 @@ int const &overload_params_difference3(int p1, int const &a, int p2) { return a;
 int const &overload_params_difference3(int p1, long &&a, int p2);
 } // namespace overload
+
+namespace gh117696 {
+namespace use_lifetime_bound_attr {
+int const &f(int const &a [[clang::lifetimebound]]) { return a; }
+} // namespace use_lifetime_bound_attr
+} // namespace gh117696
diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst
index e17d741b0a00e..c8f1d7f5a7758 100644
--- a/clang/docs/ClangFormat.rst
+++ b/clang/docs/ClangFormat.rst
@@ -49,7 +49,7 @@ to format C/C++/Java/JavaScript/JSON/Objective-C/Protobuf/C# code.
   supported:
     CSharp: .cs
     Java: .java
-    JavaScript: .mjs .js .ts
+    JavaScript: .js .mjs .cjs .ts
     Json: .json
     Objective-C: .m .mm
     Proto: .proto .protodevel
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index c053a5ab3c528..6b950d05fb9bf 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -648,7 +648,7 @@ elementwise to the input.
 Unless specified otherwise operation(±0) = ±0 and operation(±infinity) = ±infinity
 
 The integer elementwise intrinsics, including ``__builtin_elementwise_popcount``,
-can be called in a ``constexpr`` context.
+``__builtin_elementwise_bitreverse``, can be called in a ``constexpr`` context.
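A minimal sketch of what this constant-evaluation support enables (illustrative, not part of the patch; ``0xF0`` has four set bits, and bit-reversing ``0x01`` in an 8-bit type yields ``0x80``):

.. code-block:: c++

   // Both builtins now fold in constant expressions.
   static_assert(__builtin_elementwise_popcount(0xF0u) == 4);
   static_assert(__builtin_elementwise_bitreverse((unsigned char)0x01) == 0x80);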
============================================== ====================================================================== ========================================= Name Operation Supported element types @@ -1989,7 +1989,7 @@ Enumerations with a fixed underlying type ----------------------------------------- Clang provides support for C++11 enumerations with a fixed underlying type -within Objective-C. For example, one can write an enumeration type as: +within Objective-C and C `prior to C23 `_. For example, one can write an enumeration type as: .. code-block:: c++ @@ -2001,6 +2001,14 @@ value, is ``unsigned char``. Use ``__has_feature(objc_fixed_enum)`` to determine whether support for fixed underlying types is available in Objective-C. +Use ``__has_extension(c_fixed_enum)`` to determine whether support for fixed +underlying types is available in C prior to C23. This will also report ``true`` in C23 +and later modes as the functionality is available even if it's not an extension in +those modes. + +Use ``__has_feature(c_fixed_enum)`` to determine whether support for fixed +underlying types is available in C23 and later. + Interoperability with C++11 lambdas ----------------------------------- diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0bb2eb820cd72..755418e9550cf 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -310,6 +310,9 @@ Resolutions to C++ Defect Reports by default. (`CWG2521: User-defined literals and reserved identifiers `_). +- Fix name lookup for a dependent base class that is the current instantiation. + (`CWG591: When a dependent base class is the current instantiation `_). + C Language Changes ------------------ @@ -404,6 +407,7 @@ Non-comprehensive list of changes in this release - ``__builtin_reduce_and`` function can now be used in constant expressions. - ``__builtin_reduce_or`` and ``__builtin_reduce_xor`` functions can now be used in constant expressions. - ``__builtin_elementwise_popcount`` function can now be used in constant expressions. +- ``__builtin_elementwise_bitreverse`` function can now be used in constant expressions. New Compiler Flags ------------------ @@ -757,12 +761,15 @@ Bug Fixes to C++ Support - Name independent data members were not correctly initialized from default member initializers. (#GH114069) - Fixed expression transformation for ``[[assume(...)]]``, allowing using pack indexing expressions within the assumption if they also occur inside of a dependent lambda. (#GH114787) +- Lambdas now capture function types without considering top-level const qualifiers. (#GH84961) - Clang now uses valid deduced type locations when diagnosing functions with trailing return type missing placeholder return type. (#GH78694) - Fixed a bug where bounds of partially expanded pack indexing expressions were checked too early. (#GH116105) - Fixed an assertion failure caused by using ``consteval`` in condition in consumed analyses. (#GH117385) - Fix a crash caused by incorrect argument position in merging deduced template arguments. (#GH113659) - Fixed an assertion failure caused by mangled names with invalid identifiers. (#GH112205) +- Fixed an incorrect lambda scope of generic lambdas that caused Clang to crash when computing potential lambda + captures at the end of a full expression. 
(#GH115931) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -929,9 +936,15 @@ and `-mbulk-memory` flags, which correspond to the [Bulk Memory Operations] and [Non-trapping float-to-int Conversions] language features, which are [widely implemented in engines]. +A new Lime1 target CPU is added, -mcpu=lime1. This CPU follows the definition of +the Lime1 CPU [here], and enables -mmultivalue, -mmutable-globals, +-mcall-indirect-overlong, -msign-ext, -mbulk-memory-opt, -mnontrapping-fptoint, +and -mextended-const. + [Bulk Memory Operations]: https://github.com/WebAssembly/bulk-memory-operations/blob/master/proposals/bulk-memory-operations/Overview.md [Non-trapping float-to-int Conversions]: https://github.com/WebAssembly/spec/blob/master/proposals/nontrapping-float-to-int-conversion/Overview.md [widely implemented in engines]: https://webassembly.org/features/ +[here]: https://github.com/WebAssembly/tool-conventions/blob/main/Lime.md#lime1 AVR Support ^^^^^^^^^^^ @@ -965,8 +978,14 @@ AST Matchers - Ensure ``hasName`` matches template specializations across inline namespaces, making `matchesNodeFullSlow` and `matchesNodeFullFast` consistent. +- Improved the performance of the ``getExpansionLocOfMacro`` by tracking already processed macros during recursion. + - Add ``exportDecl`` matcher to match export declaration. +- Ensure ``hasType`` and ``hasDeclaration`` match Objective-C interface declarations. + +- Ensure ``pointee`` matches Objective-C pointer types. + clang-format ------------ diff --git a/clang/docs/tools/dump_ast_matchers.py b/clang/docs/tools/dump_ast_matchers.py index 705ff0d4d4098..b6f00657ec914 100755 --- a/clang/docs/tools/dump_ast_matchers.py +++ b/clang/docs/tools/dump_ast_matchers.py @@ -5,6 +5,7 @@ import collections import re +import os try: from urllib.request import urlopen @@ -18,7 +19,11 @@ CLASS_INDEX_PAGE = None print("Unable to get %s: %s" % (CLASS_INDEX_PAGE_URL, e)) -MATCHERS_FILE = "../../include/clang/ASTMatchers/ASTMatchers.h" +CURRENT_DIR = os.path.dirname(__file__) +MATCHERS_FILE = os.path.join( + CURRENT_DIR, "../../include/clang/ASTMatchers/ASTMatchers.h" +) +HTML_FILE = os.path.join(CURRENT_DIR, "../LibASTMatchersReference.html") # Each matcher is documented in one row of the form: # result | name | argA @@ -590,7 +595,7 @@ def sort_table(matcher_type, matcher_map): narrowing_matcher_table = sort_table("NARROWING", narrowing_matchers) traversal_matcher_table = sort_table("TRAVERSAL", traversal_matchers) -reference = open("../LibASTMatchersReference.html").read() +reference = open(HTML_FILE).read() reference = re.sub( r"", node_matcher_table, diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 90a52b1dcbf62..6fd6c73a516f0 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -3754,6 +3754,8 @@ class ArrayParameterType : public ConstantArrayType { static bool classof(const Type *T) { return T->getTypeClass() == ArrayParameter; } + + QualType getConstantArrayType(const ASTContext &Ctx) const; }; /// Represents a C array with an unspecified size. 
For example 'int A[]' has diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h index efad600a3c58c..897aa25dc95cc 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchers.h +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h @@ -4044,7 +4044,7 @@ AST_POLYMORPHIC_MATCHER_P_OVERLOAD( AST_POLYMORPHIC_MATCHER_P_OVERLOAD( hasType, AST_POLYMORPHIC_SUPPORTED_TYPES(Expr, FriendDecl, ValueDecl, - CXXBaseSpecifier), + CXXBaseSpecifier, ObjCInterfaceDecl), internal::Matcher, InnerMatcher, 1) { QualType QT = internal::getUnderlyingType(Node); if (!QT.isNull()) @@ -7445,7 +7445,8 @@ extern const AstTypeMatcher rValueReferenceType; AST_TYPELOC_TRAVERSE_MATCHER_DECL( pointee, getPointee, AST_POLYMORPHIC_SUPPORTED_TYPES(BlockPointerType, MemberPointerType, - PointerType, ReferenceType)); + PointerType, ReferenceType, + ObjCObjectPointerType)); /// Matches typedef types. /// diff --git a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h index ab8b146453e76..04804d5def046 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h +++ b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h @@ -161,6 +161,9 @@ inline QualType getUnderlyingType(const FriendDecl &Node) { inline QualType getUnderlyingType(const CXXBaseSpecifier &Node) { return Node.getType(); } +inline QualType getUnderlyingType(const ObjCInterfaceDecl &Node) { + return Node.getTypeForDecl()->getPointeeType(); +} /// Unifies obtaining a `TypeSourceInfo` from different node types. template { return matchesDecl(Node.getDecl(), Finder, Builder); } + bool matchesSpecialized(const ObjCInterfaceDecl &Node, ASTMatchFinder *Finder, + BoundNodesTreeBuilder *Builder) const { + return matchesDecl(Node.getCanonicalDecl(), Finder, Builder); + } + /// Extracts the operator new of the new call and returns whether the /// inner matcher matches on it. bool matchesSpecialized(const CXXNewExpr &Node, @@ -1213,7 +1221,7 @@ using HasDeclarationSupportedTypes = ElaboratedType, InjectedClassNameType, LabelStmt, AddrLabelExpr, MemberExpr, QualType, RecordType, TagType, TemplateSpecializationType, TemplateTypeParmType, TypedefType, - UnresolvedUsingType, ObjCIvarRefExpr>; + UnresolvedUsingType, ObjCIvarRefExpr, ObjCInterfaceDecl>; /// A Matcher that allows binding the node it matches to an id. /// diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 130e91103da06..e2c3d3c535571 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -1270,7 +1270,7 @@ def ElementwiseATan2 : Builtin { def ElementwiseBitreverse : Builtin { let Spellings = ["__builtin_elementwise_bitreverse"]; - let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; let Prototype = "void(...)"; } @@ -4930,6 +4930,12 @@ def HLSLClip: LangBuiltin<"HLSL_LANG"> { let Prototype = "void(...)"; } +def HLSLGroupMemoryBarrierWithGroupSync: LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_group_memory_barrier_with_group_sync"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void()"; +} + // Builtins for XRay. 
def XRayCustomEvent : Builtin { let Spellings = ["__xray_customevent"]; diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index ac0e178d1cb41..3ac490d30371b 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -294,11 +294,13 @@ def : DiagGroup<"c++1z-compat-mangling", [CXX17CompatMangling]>; // Name of this warning in GCC. def NoexceptType : DiagGroup<"noexcept-type", [CXX17CompatMangling]>; +def VariadicMacroArgumentsOmitted : DiagGroup<"variadic-macro-arguments-omitted">; + // Warnings for C code which is not compatible with previous C standards. def CPre11Compat : DiagGroup<"pre-c11-compat">; def CPre11CompatPedantic : DiagGroup<"pre-c11-compat-pedantic", [CPre11Compat]>; -def CPre23Compat : DiagGroup<"pre-c23-compat">; +def CPre23Compat : DiagGroup<"pre-c23-compat", [VariadicMacroArgumentsOmitted]>; def CPre23CompatPedantic : DiagGroup<"pre-c23-compat-pedantic", [CPre23Compat]>; def : DiagGroup<"pre-c2x-compat", [CPre23Compat]>; @@ -906,7 +908,7 @@ def VolatileRegisterVar : DiagGroup<"volatile-register-var">; def Visibility : DiagGroup<"visibility">; def ZeroLengthArray : DiagGroup<"zero-length-array">; def GNUZeroLineDirective : DiagGroup<"gnu-zero-line-directive">; -def GNUZeroVariadicMacroArguments : DiagGroup<"gnu-zero-variadic-macro-arguments">; +def GNUZeroVariadicMacroArguments : DiagGroup<"gnu-zero-variadic-macro-arguments", [VariadicMacroArgumentsOmitted]>; def MisleadingIndentation : DiagGroup<"misleading-indentation">; def PtrAuthNullPointers : DiagGroup<"ptrauth-null-pointers">; @@ -1199,7 +1201,7 @@ def CXX17 : DiagGroup<"c++17-extensions", [CXX17Attrs]>; // A warning group for warnings about using C++20 features as extensions in // earlier C++ versions. -def CXX20 : DiagGroup<"c++20-extensions", [CXX20Designator, CXX20Attrs]>; +def CXX20 : DiagGroup<"c++20-extensions", [CXX20Designator, CXX20Attrs, VariadicMacroArgumentsOmitted]>; // A warning group for warnings about using C++23 features as extensions in // earlier C++ versions. @@ -1226,7 +1228,7 @@ def C11 : DiagGroup<"c11-extensions">; def C99 : DiagGroup<"c99-extensions", [C99Designator]>; // A warning group for warnings about using C23 features as extensions. -def C23 : DiagGroup<"c23-extensions">; +def C23 : DiagGroup<"c23-extensions", [VariadicMacroArgumentsOmitted]>; def : DiagGroup<"c2x-extensions", [C23]>; diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 889370221f32f..959376b084721 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -486,14 +486,14 @@ def ext_embedded_directive : Extension< InGroup>; def ext_c_missing_varargs_arg : Extension< "passing no argument for the '...' parameter of a variadic macro is " - "a C23 extension">, InGroup; + "a C23 extension">, InGroup; def ext_cxx_missing_varargs_arg : Extension< "passing no argument for the '...' parameter of a variadic macro is " - "a C++20 extension">, InGroup; + "a C++20 extension">, InGroup; def warn_c17_compat_missing_varargs_arg : Warning< "passing no argument for the '...' parameter of a variadic macro is " "incompatible with C standards before C23">, - InGroup, DefaultIgnore; + InGroup, DefaultIgnore; def warn_cxx17_compat_missing_varargs_arg : Warning< "passing no argument for the '...' 
parameter of a variadic macro is " "incompatible with C++ standards before C++20">, diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def index 9088c867d53ce..15c59c6bcdf29 100644 --- a/clang/include/clang/Basic/Features.def +++ b/clang/include/clang/Basic/Features.def @@ -163,6 +163,8 @@ FEATURE(c_atomic, LangOpts.C11) FEATURE(c_generic_selections, LangOpts.C11) FEATURE(c_static_assert, LangOpts.C11) FEATURE(c_thread_local, LangOpts.C11 &&PP.getTargetInfo().isTLSSupported()) +// C23 features +FEATURE(c_fixed_enum, LangOpts.C23) // C++11 features FEATURE(cxx_access_control_sfinae, LangOpts.CPlusPlus11) FEATURE(cxx_alias_templates, LangOpts.CPlusPlus11) @@ -269,6 +271,7 @@ EXTENSION(c_static_assert, true) EXTENSION(c_thread_local, PP.getTargetInfo().isTLSSupported()) // C23 features supported by other languages as extensions EXTENSION(c_attributes, true) +EXTENSION(c_fixed_enum, true) // C++11 features supported by other languages as extensions. EXTENSION(cxx_atomic, LangOpts.CPlusPlus) EXTENSION(cxx_default_function_template_args, LangOpts.CPlusPlus) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 9c356c9d2ea4e..4bc0b97ea68f2 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1786,12 +1786,12 @@ defm debug_info_for_profiling : BoolFOption<"debug-info-for-profiling", PosFlag, NegFlag>; -def fprofile_generate_cold_function_coverage : Flag<["-"], "fprofile-generate-cold-function-coverage">, +def fprofile_generate_cold_function_coverage : Flag<["-"], "fprofile-generate-cold-function-coverage">, Group, Visibility<[ClangOption, CLOption]>, HelpText<"Generate instrumented code to collect coverage info for cold functions into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">; -def fprofile_generate_cold_function_coverage_EQ : Joined<["-"], "fprofile-generate-cold-function-coverage=">, +def fprofile_generate_cold_function_coverage_EQ : Joined<["-"], "fprofile-generate-cold-function-coverage=">, Group, Visibility<[ClangOption, CLOption]>, MetaVarName<"">, - HelpText<"Generate instrumented code to collect coverage info for cold functions into /default.profraw (overridden by LLVM_PROFILE_FILE env var)">; + HelpText<"Generate instrumented code to collect coverage info for cold functions into /default.profraw (overridden by LLVM_PROFILE_FILE env var)">; def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">, Group, Visibility<[ClangOption, CLOption]>, HelpText<"Generate instrumented code to collect execution counts into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">; @@ -5092,6 +5092,10 @@ def matomics : Flag<["-"], "matomics">, Group; def mno_atomics : Flag<["-"], "mno-atomics">, Group; def mbulk_memory : Flag<["-"], "mbulk-memory">, Group; def mno_bulk_memory : Flag<["-"], "mno-bulk-memory">, Group; +def mbulk_memory_opt : Flag<["-"], "mbulk-memory-opt">, Group; +def mno_bulk_memory_opt : Flag<["-"], "mno-bulk-memory-opt">, Group; +def mcall_indirect_overlong : Flag<["-"], "mcall-indirect-overlong">, Group; +def mno_call_indirect_overlong : Flag<["-"], "mno-call-indirect-overlong">, Group; def mexception_handing : Flag<["-"], "mexception-handling">, Group; def mno_exception_handing : Flag<["-"], "mno-exception-handling">, Group; def mextended_const : Flag<["-"], "mextended-const">, Group; diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp 
index a95353fd2943c..eb102f1e5c7f2 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1000,7 +1000,10 @@ bool Compiler::VisitPointerArithBinOp(const BinaryOperator *E) { if (!visitAsPointer(RHS, *RT) || !visitAsPointer(LHS, *LT)) return false; - return this->emitSubPtr(classifyPrim(E->getType()), E); + PrimType IntT = classifyPrim(E->getType()); + if (!this->emitSubPtr(IntT, E)) + return false; + return DiscardResult ? this->emitPop(IntT, E) : true; } PrimType OffsetType; @@ -5911,6 +5914,9 @@ bool Compiler::VisitVectorUnaryOperator(const UnaryOperator *E) { return this->discard(SubExpr); auto UnaryOp = E->getOpcode(); + if (UnaryOp == UO_Extension) + return this->delegate(SubExpr); + if (UnaryOp != UO_Plus && UnaryOp != UO_Minus && UnaryOp != UO_LNot && UnaryOp != UO_Not && UnaryOp != UO_AddrOf) return this->emitInvalid(E); diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index db3703a60db69..85cffb0c4332d 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -148,6 +148,17 @@ static bool retPrimValue(InterpState &S, CodePtr OpPC, #undef RET_CASE } +static void diagnoseNonConstexprBuiltin(InterpState &S, CodePtr OpPC, + unsigned ID) { + auto Loc = S.Current->getSource(OpPC); + if (S.getLangOpts().CPlusPlus11) + S.CCEDiag(Loc, diag::note_constexpr_invalid_function) + << /*isConstexpr=*/0 << /*isConstructor=*/0 + << ("'" + S.getASTContext().BuiltinInfo.getName(ID) + "'").str(); + else + S.CCEDiag(Loc, diag::note_invalid_subexpr_in_const_expr); +} + static bool interp__builtin_is_constant_evaluated(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { @@ -181,10 +192,14 @@ static bool interp__builtin_is_constant_evaluated(InterpState &S, CodePtr OpPC, static bool interp__builtin_strcmp(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, - const CallExpr *Call) { + const Function *Func, const CallExpr *Call) { + unsigned ID = Func->getBuiltinID(); const Pointer &A = getParam(Frame, 0); const Pointer &B = getParam(Frame, 1); + if (ID == Builtin::BIstrcmp) + diagnoseNonConstexprBuiltin(S, OpPC, ID); + if (!CheckLive(S, OpPC, A, AK_Read) || !CheckLive(S, OpPC, B, AK_Read)) return false; @@ -224,9 +239,13 @@ static bool interp__builtin_strcmp(InterpState &S, CodePtr OpPC, static bool interp__builtin_strlen(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, - const CallExpr *Call) { + const Function *Func, const CallExpr *Call) { + unsigned ID = Func->getBuiltinID(); const Pointer &StrPtr = getParam(Frame, 0); + if (ID == Builtin::BIstrlen) + diagnoseNonConstexprBuiltin(S, OpPC, ID); + if (!CheckArray(S, OpPC, StrPtr)) return false; @@ -1772,6 +1791,7 @@ static bool interp__builtin_elementwise_popcount(InterpState &S, CodePtr OpPC, INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.atIndex(I).deref() = T::from(Arg.atIndex(I).deref().toAPSInt().popcount()); + Dst.atIndex(I).initialize(); }); } @@ -1781,14 +1801,17 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { assert(Call->getNumArgs() == 3); + unsigned ID = Func->getBuiltinID(); Pointer DestPtr = getParam(Frame, 0); const Pointer &SrcPtr = getParam(Frame, 1); const APSInt &Size = peekToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(2))); assert(!Size.isSigned() && "memcpy and friends take an unsigned size"); - if (DestPtr.isDummy() || SrcPtr.isDummy()) - return false; + if (ID == 
Builtin::BImemcpy || ID == Builtin::BImemmove) + diagnoseNonConstexprBuiltin(S, OpPC, ID); + + bool Move = (ID == Builtin::BI__builtin_memmove || ID == Builtin::BImemmove); // If the size is zero, we treat this as always being a valid no-op. if (Size.isZero()) { @@ -1796,6 +1819,18 @@ static bool interp__builtin_memcpy(InterpState &S, CodePtr OpPC, return true; } + if (SrcPtr.isZero() || DestPtr.isZero()) { + Pointer DiagPtr = (SrcPtr.isZero() ? SrcPtr : DestPtr); + S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_memcpy_null) + << /*IsMove=*/Move << /*IsWchar=*/false << !SrcPtr.isZero() + << DiagPtr.toDiagnosticString(S.getASTContext()); + return false; + } + + // As a last resort, reject dummy pointers. + if (DestPtr.isDummy() || SrcPtr.isDummy()) + return false; + if (!DoBitCastPtr(S, OpPC, SrcPtr, DestPtr)) return false; @@ -1818,11 +1853,13 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, case Builtin::BI__assume: break; case Builtin::BI__builtin_strcmp: - if (!interp__builtin_strcmp(S, OpPC, Frame, Call)) + case Builtin::BIstrcmp: + if (!interp__builtin_strcmp(S, OpPC, Frame, F, Call)) return false; break; case Builtin::BI__builtin_strlen: - if (!interp__builtin_strlen(S, OpPC, Frame, Call)) + case Builtin::BIstrlen: + if (!interp__builtin_strlen(S, OpPC, Frame, F, Call)) return false; break; case Builtin::BI__builtin_nan: @@ -2263,6 +2300,9 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, break; case Builtin::BI__builtin_memcpy: + case Builtin::BImemcpy: + case Builtin::BI__builtin_memmove: + case Builtin::BImemmove: if (!interp__builtin_memcpy(S, OpPC, Frame, F, Call)) return false; break; diff --git a/clang/lib/AST/CXXInheritance.cpp b/clang/lib/AST/CXXInheritance.cpp index aefc06e9197cf..10b8d524ff897 100644 --- a/clang/lib/AST/CXXInheritance.cpp +++ b/clang/lib/AST/CXXInheritance.cpp @@ -134,7 +134,7 @@ bool CXXRecordDecl::forallBases(ForallBasesCallback BaseMatches) const { return false; CXXRecordDecl *Base = - cast_or_null(Ty->getDecl()->getDefinition()); + cast_if_present(Ty->getDecl()->getDefinition()); if (!Base || (Base->isDependentContext() && !Base->isCurrentInstantiation(Record))) { @@ -169,13 +169,21 @@ bool CXXBasePaths::lookupInBases(ASTContext &Context, QualType BaseType = Context.getCanonicalType(BaseSpec.getType()).getUnqualifiedType(); + bool isCurrentInstantiation = isa(BaseType); + if (!isCurrentInstantiation) { + if (auto *BaseRecord = cast_if_present( + BaseSpec.getType()->getAsRecordDecl())) + isCurrentInstantiation = BaseRecord->isDependentContext() && + BaseRecord->isCurrentInstantiation(Record); + } // C++ [temp.dep]p3: // In the definition of a class template or a member of a class template, // if a base class of the class template depends on a template-parameter, // the base class scope is not examined during unqualified name lookup // either at the point of definition of the class template or member or // during an instantiation of the class tem- plate or member. 
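      // Illustrative sketch, not part of the patch: the current-instantiation
      // case handled here matters for CWG591-style code such as
      //
      //   template <typename T> struct A {
      //     int m;
      //     struct B : A<T> {       // dependent base, but the current instantiation
      //       int f() { return m; } // unqualified lookup can now search A<T>
      //     };
      //   };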
- if (!LookupInDependent && BaseType->isDependentType()) + if (!LookupInDependent && + (BaseType->isDependentType() && !isCurrentInstantiation)) continue; // Determine whether we need to visit this base class at all, @@ -243,9 +251,8 @@ bool CXXBasePaths::lookupInBases(ASTContext &Context, return FoundPath; } } else if (VisitBase) { - CXXRecordDecl *BaseRecord; + CXXRecordDecl *BaseRecord = nullptr; if (LookupInDependent) { - BaseRecord = nullptr; const TemplateSpecializationType *TST = BaseSpec.getType()->getAs(); if (!TST) { @@ -264,8 +271,7 @@ bool CXXBasePaths::lookupInBases(ASTContext &Context, BaseRecord = nullptr; } } else { - BaseRecord = cast( - BaseSpec.getType()->castAs()->getDecl()); + BaseRecord = cast(BaseSpec.getType()->getAsRecordDecl()); } if (BaseRecord && lookupInBases(Context, BaseRecord, BaseMatches, LookupInDependent)) { diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index f2f2835641245..af73c658d6a0c 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -2602,8 +2602,6 @@ bool CXXMethodDecl::isMoveAssignmentOperator() const { void CXXMethodDecl::addOverriddenMethod(const CXXMethodDecl *MD) { assert(MD->isCanonicalDecl() && "Method is not canonical!"); - assert(!MD->getParent()->isDependentContext() && - "Can't add an overridden method to a class template!"); assert(MD->isVirtual() && "Method is not virtual!"); getASTContext().addOverriddenMethod(this, MD); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index bb5ab67328fbc..6b5b95aee3552 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11310,7 +11310,8 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { switch (E->getBuiltinCallee()) { default: return false; - case Builtin::BI__builtin_elementwise_popcount: { + case Builtin::BI__builtin_elementwise_popcount: + case Builtin::BI__builtin_elementwise_bitreverse: { APValue Source; if (!EvaluateAsRValue(Info, E->getArg(0), Source)) return false; @@ -11322,9 +11323,18 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) { APSInt Elt = Source.getVectorElt(EltNum).getInt(); - ResultElements.push_back( - APValue(APSInt(APInt(Info.Ctx.getIntWidth(DestEltTy), Elt.popcount()), - DestEltTy->isUnsignedIntegerOrEnumerationType()))); + switch (E->getBuiltinCallee()) { + case Builtin::BI__builtin_elementwise_popcount: + ResultElements.push_back(APValue( + APSInt(APInt(Info.Ctx.getIntWidth(DestEltTy), Elt.popcount()), + DestEltTy->isUnsignedIntegerOrEnumerationType()))); + break; + case Builtin::BI__builtin_elementwise_bitreverse: + ResultElements.push_back( + APValue(APSInt(Elt.reverseBits(), + DestEltTy->isUnsignedIntegerOrEnumerationType()))); + break; + } } return Success(APValue(ResultElements.data(), ResultElements.size()), E); @@ -12833,7 +12843,8 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, case Builtin::BI__builtin_bitreverse8: case Builtin::BI__builtin_bitreverse16: case Builtin::BI__builtin_bitreverse32: - case Builtin::BI__builtin_bitreverse64: { + case Builtin::BI__builtin_bitreverse64: + case Builtin::BI__builtin_elementwise_bitreverse: { APSInt Val; if (!EvaluateInteger(E->getArg(0), Val, Info)) return false; diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 94a7ce6c1321d..7642ff7ca606c 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -3372,7 +3372,15 @@ void 
MicrosoftCXXNameMangler::mangleType(const MemberPointerType *T, void MicrosoftCXXNameMangler::mangleType(const TemplateTypeParmType *T, Qualifiers, SourceRange Range) { - Error(Range.getBegin(), "template type parameter type") << Range; + Out << '?'; + + llvm::SmallString<64> Name; + Name += "<TTPT_"; + Name += llvm::utostr(T->getDepth()); + Name += "_"; + Name += llvm::utostr(T->getIndex()); + Name += ">"; + mangleSourceName(Name); } void MicrosoftCXXNameMangler::mangleType(const SubstTemplateTypeParmPackType *T, diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 366bcc3216b3f..976361d07b68b 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -267,6 +267,12 @@ void ConstantArrayType::Profile(llvm::FoldingSetNodeID &ID, SizeExpr->Profile(ID, Context, true); } +QualType ArrayParameterType::getConstantArrayType(const ASTContext &Ctx) const { + return Ctx.getConstantArrayType(getElementType(), getSize(), getSizeExpr(), + getSizeModifier(), + getIndexTypeQualifiers().getAsOpaqueValue()); +} + DependentSizedArrayType::DependentSizedArrayType(QualType et, QualType can, Expr *e, ArraySizeModifier sm, unsigned tq, diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp index cdbdb65195409..bf9dc5f2373f9 100644 --- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp +++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp @@ -21,6 +21,7 @@ #include "clang/Basic/LLVM.h" #include "clang/Lex/Lexer.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -697,20 +698,27 @@ static bool isTokenAtLoc(const SourceManager &SM, const LangOptions &LangOpts, return !Invalid && Text == TokenText; } -std::optional -getExpansionLocOfMacro(StringRef MacroName, SourceLocation Loc, - const ASTContext &Context) { +static std::optional getExpansionLocOfMacroRecursive( + StringRef MacroName, SourceLocation Loc, const ASTContext &Context, + llvm::DenseSet &CheckedLocations) { auto &SM = Context.getSourceManager(); const LangOptions &LangOpts = Context.getLangOpts(); while (Loc.isMacroID()) { + if (CheckedLocations.count(Loc)) + return std::nullopt; + CheckedLocations.insert(Loc); SrcMgr::ExpansionInfo Expansion = SM.getSLocEntry(SM.getFileID(Loc)).getExpansion(); - if (Expansion.isMacroArgExpansion()) + if (Expansion.isMacroArgExpansion()) { // Check macro argument for an expansion of the given macro. For example, // `F(G(3))`, where `MacroName` is `G`. 
- if (std::optional ArgLoc = getExpansionLocOfMacro( - MacroName, Expansion.getSpellingLoc(), Context)) + if (std::optional ArgLoc = + getExpansionLocOfMacroRecursive(MacroName, + Expansion.getSpellingLoc(), + Context, CheckedLocations)) { return ArgLoc; + } + } Loc = Expansion.getExpansionLocStart(); if (isTokenAtLoc(SM, LangOpts, MacroName, Loc)) return Loc; @@ -718,6 +726,14 @@ getExpansionLocOfMacro(StringRef MacroName, SourceLocation Loc, return std::nullopt; } +std::optional +getExpansionLocOfMacro(StringRef MacroName, SourceLocation Loc, + const ASTContext &Context) { + llvm::DenseSet CheckedLocations; + return getExpansionLocOfMacroRecursive(MacroName, Loc, Context, + CheckedLocations); +} + std::shared_ptr createAndVerifyRegex(StringRef Regex, llvm::Regex::RegexFlags Flags, StringRef MatcherID) { @@ -1098,7 +1114,8 @@ AST_TYPELOC_TRAVERSE_MATCHER_DEF(hasValueType, AST_TYPELOC_TRAVERSE_MATCHER_DEF( pointee, AST_POLYMORPHIC_SUPPORTED_TYPES(BlockPointerType, MemberPointerType, - PointerType, ReferenceType)); + PointerType, ReferenceType, + ObjCObjectPointerType)); const internal::VariadicDynCastAllOfMatcher ompExecutableDirective; diff --git a/clang/lib/ASTMatchers/Dynamic/Registry.cpp b/clang/lib/ASTMatchers/Dynamic/Registry.cpp index 8d36ad5c80b5d..837633fb2f060 100644 --- a/clang/lib/ASTMatchers/Dynamic/Registry.cpp +++ b/clang/lib/ASTMatchers/Dynamic/Registry.cpp @@ -243,6 +243,7 @@ RegistryMaps::RegistryMaps() { REGISTER_MATCHER(equalsBoundNode); REGISTER_MATCHER(equalsIntegralValue); REGISTER_MATCHER(explicitCastExpr); + REGISTER_MATCHER(exportDecl); REGISTER_MATCHER(expr); REGISTER_MATCHER(exprWithCleanups); REGISTER_MATCHER(fieldDecl); diff --git a/clang/lib/Basic/Targets/WebAssembly.cpp b/clang/lib/Basic/Targets/WebAssembly.cpp index 0b380bdf835ff..85e550ad20d5e 100644 --- a/clang/lib/Basic/Targets/WebAssembly.cpp +++ b/clang/lib/Basic/Targets/WebAssembly.cpp @@ -31,7 +31,7 @@ static constexpr Builtin::Info BuiltinInfo[] = { }; static constexpr llvm::StringLiteral ValidCPUNames[] = { - {"mvp"}, {"bleeding-edge"}, {"generic"}}; + {"mvp"}, {"bleeding-edge"}, {"generic"}, {"lime1"}}; StringRef WebAssemblyTargetInfo::getABI() const { return ABI; } @@ -47,6 +47,8 @@ bool WebAssemblyTargetInfo::hasFeature(StringRef Feature) const { return llvm::StringSwitch(Feature) .Case("atomics", HasAtomics) .Case("bulk-memory", HasBulkMemory) + .Case("bulk-memory-opt", HasBulkMemoryOpt) + .Case("call-indirect-overlong", HasCallIndirectOverlong) .Case("exception-handling", HasExceptionHandling) .Case("extended-const", HasExtendedConst) .Case("fp16", HasFP16) @@ -79,6 +81,8 @@ void WebAssemblyTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__wasm_atomics__"); if (HasBulkMemory) Builder.defineMacro("__wasm_bulk_memory__"); + if (HasBulkMemoryOpt) + Builder.defineMacro("__wasm_bulk_memory_opt__"); if (HasExceptionHandling) Builder.defineMacro("__wasm_exception_handling__"); if (HasExtendedConst) @@ -155,12 +159,25 @@ bool WebAssemblyTargetInfo::initFeatureMap( const std::vector &FeaturesVec) const { auto addGenericFeatures = [&]() { Features["bulk-memory"] = true; + Features["bulk-memory-opt"] = true; + Features["call-indirect-overlong"] = true; Features["multivalue"] = true; Features["mutable-globals"] = true; Features["nontrapping-fptoint"] = true; Features["reference-types"] = true; Features["sign-ext"] = true; }; + auto addLime1Features = [&]() { + // Lime1: + // <https://github.com/WebAssembly/tool-conventions/blob/main/Lime.md#lime1> + Features["bulk-memory-opt"] = true; + Features["call-indirect-overlong"] = true; + 
Features["extended-const"] = true; + Features["multivalue"] = true; + Features["mutable-globals"] = true; + Features["nontrapping-fptoint"] = true; + Features["sign-ext"] = true; + }; auto addBleedingEdgeFeatures = [&]() { addGenericFeatures(); Features["atomics"] = true; @@ -174,6 +191,8 @@ bool WebAssemblyTargetInfo::initFeatureMap( }; if (CPU == "generic") { addGenericFeatures(); + } else if (CPU == "lime1") { + addLime1Features(); } else if (CPU == "bleeding-edge") { addBleedingEdgeFeatures(); } @@ -200,6 +219,22 @@ bool WebAssemblyTargetInfo::handleTargetFeatures( HasBulkMemory = false; continue; } + if (Feature == "+bulk-memory-opt") { + HasBulkMemoryOpt = true; + continue; + } + if (Feature == "-bulk-memory-opt") { + HasBulkMemoryOpt = false; + continue; + } + if (Feature == "+call-indirect-overlong") { + HasCallIndirectOverlong = true; + continue; + } + if (Feature == "-call-indirect-overlong") { + HasCallIndirectOverlong = false; + continue; + } if (Feature == "+exception-handling") { HasExceptionHandling = true; continue; @@ -310,6 +345,18 @@ bool WebAssemblyTargetInfo::handleTargetFeatures( << Feature << "-target-feature"; return false; } + + // bulk-memory-opt is a subset of bulk-memory. + if (HasBulkMemory) { + HasBulkMemoryOpt = true; + } + + // The reference-types feature included the change to `call_indirect` + // encodings to support overlong immediates. + if (HasReferenceTypes) { + HasCallIndirectOverlong = true; + } + return true; } diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h index d8ed88b4c840f..c92ed161a92a7 100644 --- a/clang/lib/Basic/Targets/WebAssembly.h +++ b/clang/lib/Basic/Targets/WebAssembly.h @@ -55,6 +55,8 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo { bool HasAtomics = false; bool HasBulkMemory = false; + bool HasBulkMemoryOpt = false; + bool HasCallIndirectOverlong = false; bool HasExceptionHandling = false; bool HasExtendedConst = false; bool HasFP16 = false; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 8c31bbe056741..1b16888a0711b 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -1162,6 +1162,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("pconfig", true) .Case("pku", true) .Case("popcnt", true) + .Case("prefer-256-bit", true) .Case("prefetchi", true) .Case("prfchw", true) .Case("ptwrite", true) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a54dd884c7fa5..7588f8427cdd3 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19456,6 +19456,12 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: { assert(E->getArg(0)->getType()->hasFloatingRepresentation() && "clip operands types mismatch"); return handleHlslClip(E, this); + case Builtin::BI__builtin_hlsl_group_memory_barrier_with_group_sync: { + Intrinsic::ID ID = + CGM.getHLSLRuntime().getGroupMemoryBarrierWithGroupSyncIntrinsic(); + return EmitRuntimeCall( + Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + } } return nullptr; } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 28a5526fbea06..7c8d962fa5a92 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -4725,15 +4725,17 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E, return emitWritebackArg(*this, args, CRE); } - assert(type->isReferenceType() == E->isGLValue() && - "reference binding to unmaterialized r-value!"); 
- // Add writeback for HLSLOutParamExpr. + // Needs to be before the assert below because HLSLOutArgExpr is an LValue + // and is not a reference. if (const HLSLOutArgExpr *OE = dyn_cast(E)) { EmitHLSLOutArgExpr(OE, args, type); return; } + assert(type->isReferenceType() == E->isGLValue() && + "reference binding to unmaterialized r-value!"); + if (E->isGLValue()) { assert(E->getObjectKind() == OK_Ordinary); return args.add(EmitReferenceBindingToExpr(E), type); @@ -5322,6 +5324,14 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, IRCallArgs[FirstIRArg] = Val; break; } + } else if (I->getType()->isArrayParameterType()) { + // Don't produce a temporary for ArrayParameterType arguments. + // ArrayParameterType arguments are only created from + // HLSL_ArrayRValue casts and HLSLOutArgExpr expressions, both + // of which create temporaries already. This allows us to just use the + // scalar for the decayed array pointer as the argument directly. + IRCallArgs[FirstIRArg] = I->getKnownRValue().getScalarVal(); + break; } // For non-aggregate args and aggregate args meeting conditions above diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 229f0e29f0234..5fccc9cbb37ec 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -5827,9 +5827,12 @@ LValue CodeGenFunction::EmitBinaryOperatorLValue(const BinaryOperator *E) { // This function implements trivial copy assignment for HLSL's // assignable constant arrays. LValue CodeGenFunction::EmitHLSLArrayAssignLValue(const BinaryOperator *E) { - LValue TrivialAssignmentRHS = EmitLValue(E->getRHS()); + // Don't emit an LValue for the RHS because it might not be an LValue. LValue LHS = EmitLValue(E->getLHS()); - EmitAggregateAssign(LHS, TrivialAssignmentRHS, E->getLHS()->getType()); + // In C the RHS of an assignment operator is an RValue. + // EmitAggregateAssign takes an LValue for the RHS. Instead we can call + // EmitInitializationToLValue to emit an RValue into an LValue. + EmitInitializationToLValue(E->getRHS(), LHS); return LHS; } diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 854214d6bc067..bb120c8b5e9e6 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -103,6 +103,8 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromBinding, handle_fromBinding) GENERATE_HLSL_INTRINSIC_FUNCTION(BufferUpdateCounter, bufferUpdateCounter) + GENERATE_HLSL_INTRINSIC_FUNCTION(GroupMemoryBarrierWithGroupSync, + group_memory_barrier_with_group_sync) //===----------------------------------------------------------------------===// // End of reserved area for HLSL intrinsic getters. diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 8c18c88fbde7f..72c0787d7df99 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -777,6 +777,13 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, addFortranDialectOptions(Args, CmdArgs); + // 'flang -E' always produces output that is suitable for use as fixed form + // Fortran. However, it is only valid free form source if the original is also + // free form. + if (InputType == types::TY_PP_Fortran && + !Args.getLastArg(options::OPT_ffixed_form, options::OPT_ffree_form)) + CmdArgs.push_back("-ffixed-form"); + handleColorDiagnosticsArgs(D, Args, CmdArgs); // LTO mode is parsed by the Clang driver library. 
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index ee52972ce66f4..dcaac4b0d42cc 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3950,6 +3950,7 @@ static FormatStyle::LanguageKind getLanguageByFileName(StringRef FileName) { return FormatStyle::LK_Java; if (FileName.ends_with_insensitive(".js") || FileName.ends_with_insensitive(".mjs") || + FileName.ends_with_insensitive(".cjs") || FileName.ends_with_insensitive(".ts")) { return FormatStyle::LK_JavaScript; // (module) JavaScript or TypeScript. } diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index a3e0b5c65a6f5..1126e13600f8a 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -2481,5 +2481,17 @@ float3 radians(float3); _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_radians) float4 radians(float4); +//===----------------------------------------------------------------------===// +// GroupMemoryBarrierWithGroupSync builtins +//===----------------------------------------------------------------------===// + +/// \fn void GroupMemoryBarrierWithGroupSync(void) +/// \brief Blocks execution of all threads in a group until all group shared +/// accesses have been completed and all threads in the group have reached this +/// call. + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_group_memory_barrier_with_group_sync) +void GroupMemoryBarrierWithGroupSync(void); + } // namespace hlsl #endif //_HLSL_HLSL_INTRINSICS_H_ diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 942e7ece4283e..d6517511d7db4 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -723,6 +723,15 @@ ExprResult Sema::ImpCastExprToType(Expr *E, QualType Ty, QualType ExprTy = Context.getCanonicalType(E->getType()); QualType TypeTy = Context.getCanonicalType(Ty); + // This cast is used in place of a regular LValue to RValue cast for + // HLSL Array Parameter Types. It needs to be emitted even if + // ExprTy == TypeTy, except if E is an HLSLOutArgExpr. + // Emitting a cast in that case will prevent HLSLOutArgExpr from + // being handled properly in EmitCallArg. + if (Kind == CK_HLSLArrayRValue && !isa(E)) + return ImplicitCastExpr::Create(Context, Ty, Kind, E, BasePath, VK, + CurFPFeatureOverrides()); + if (ExprTy == TypeTy) return E; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index ba574307055c6..c4bb73b2924bc 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -15519,10 +15519,25 @@ LambdaScopeInfo *Sema::RebuildLambdaScopeInfo(CXXMethodDecl *CallOperator) { LSI->CallOperator = CallOperator; LSI->Lambda = LambdaClass; LSI->ReturnType = CallOperator->getReturnType(); - // This function in calls in situation where the context of the call operator - // is not entered, so we set AfterParameterList to false, so that + // When this function is called in a situation where the context of the call + // operator is not entered, we set AfterParameterList to false, so that // `tryCaptureVariable` finds explicit captures in the appropriate context. - LSI->AfterParameterList = false; + // There is also at least one situation, as in FinishTemplateArgumentDeduction(), + // where we would set the CurContext to the lambda operator before + // substituting into it. In this case the flag needs to be true such that + // tryCaptureVariable can correctly handle potential captures thereof. 
+ LSI->AfterParameterList = CurContext == CallOperator; + + // GLTemplateParameterList is necessary for getCurGenericLambda() which is + // used at the point of dealing with potential captures. + // + // We don't use LambdaClass->isGenericLambda() because this value doesn't + // flip for instantiated generic lambdas, where no FunctionTemplateDecls are + // associated. (Technically, we could recover that list from their + // instantiation patterns, but for now, the GLTemplateParameterList seems + // unnecessary in these cases.) + if (FunctionTemplateDecl *FTD = CallOperator->getDescribedFunctionTemplate()) + LSI->GLTemplateParameterList = FTD->getTemplateParameters(); const LambdaCaptureDefault LCD = LambdaClass->getLambdaCaptureDefault(); if (LCD == LCD_None) diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 1cb07368e8308..adad14cc0f1f6 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -18431,7 +18431,11 @@ static bool isVariableAlreadyCapturedInScopeInfo(CapturingScopeInfo *CSI, // are mutable in the sense that user can change their value - they are // private instances of the captured declarations. const Capture &Cap = CSI->getCapture(Var); - if (Cap.isCopyCapture() && + // C++ [expr.prim.lambda]p10: + // The type of such a data member is [...] an lvalue reference to the + // referenced function type if the entity is a reference to a function. + // [...] + if (Cap.isCopyCapture() && !DeclRefType->isFunctionType() && !(isa(CSI) && !cast(CSI)->lambdaCaptureShouldBeConst()) && !(isa(CSI) && @@ -18741,7 +18745,12 @@ static bool captureInLambda(LambdaScopeInfo *LSI, ValueDecl *Var, // parameter-declaration-clause is not followed by mutable. DeclRefType = CaptureType.getNonReferenceType(); bool Const = LSI->lambdaCaptureShouldBeConst(); - if (Const && !CaptureType->isReferenceType()) + // C++ [expr.prim.lambda]p10: + // The type of such a data member is [...] an lvalue reference to the + // referenced function type if the entity is a reference to a function. + // [...] 
+ if (Const && !CaptureType->isReferenceType() && + !DeclRefType->isFunctionType()) DeclRefType.addConst(); } diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index d85819b21c826..f58c0fa21e838 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -4431,10 +4431,21 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, break; case ICK_HLSL_Array_RValue: - FromType = Context.getArrayParameterType(FromType); - From = ImpCastExprToType(From, FromType, CK_HLSLArrayRValue, VK_PRValue, - /*BasePath=*/nullptr, CCK) - .get(); + if (ToType->isArrayParameterType()) { + FromType = Context.getArrayParameterType(FromType); + From = ImpCastExprToType(From, FromType, CK_HLSLArrayRValue, VK_PRValue, + /*BasePath=*/nullptr, CCK) + .get(); + } else { // FromType must be ArrayParameterType + assert(FromType->isArrayParameterType() && + "FromType must be ArrayParameterType in ICK_HLSL_Array_RValue \ + if it is not ToType"); + const ArrayParameterType *APT = cast(FromType); + FromType = APT->getConstantArrayType(Context); + From = ImpCastExprToType(From, FromType, CK_HLSLArrayRValue, VK_PRValue, + /*BasePath=*/nullptr, CCK) + .get(); + } break; case ICK_Function_To_Pointer: diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 654f3cd97c1c5..060df967322ac 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -694,14 +694,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitSelfClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitNumGangsClause( SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - // TODO OpenACC: Remove this check when we have combined constructs for this - // clause. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - return isNotImplemented(); - // There is no prose in the standard that says duplicates aren't allowed, // but this diagnostic is present in other compilers, as well as makes // sense. @@ -730,6 +722,7 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitNumGangsClause( // OpenACC 3.3 Section 2.5.4: // A reduction clause may not appear on a parallel construct with a // num_gangs clause that has more than one argument. + // TODO: OpenACC: Reduction on Combined Construct needs to do this too. if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel && Clause.getIntExprs().size() > 1) { auto *Parallel = @@ -751,13 +744,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitNumGangsClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitNumWorkersClause( SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - // TODO: OpenACC: Remove when we get combined constructs. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - return isNotImplemented(); - // There is no prose in the standard that says duplicates aren't allowed, // but this diagnostic is present in other compilers, as well as makes // sense. 
@@ -773,13 +759,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitNumWorkersClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause( SemaOpenACC::OpenACCParsedClause &Clause) { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - // TODO: OpenACC: Remove when we get combined constructs. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - return isNotImplemented(); - // There is no prose in the standard that says duplicates aren't allowed, // but this diagnostic is present in other compilers, as well as makes // sense. diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 4c9e37bd286de..c174922a926fc 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -2236,33 +2236,24 @@ static bool IsStandardConversion(Sema &S, Expr* From, QualType ToType, return false; } } - // Lvalue-to-rvalue conversion (C++11 4.1): - // A glvalue (3.10) of a non-function, non-array type T can - // be converted to a prvalue. - bool argIsLValue = From->isGLValue(); - if (argIsLValue && !FromType->canDecayToPointerType() && - S.Context.getCanonicalType(FromType) != S.Context.OverloadTy) { - SCS.First = ICK_Lvalue_To_Rvalue; - - // C11 6.3.2.1p2: - // ... if the lvalue has atomic type, the value has the non-atomic version - // of the type of the lvalue ... - if (const AtomicType *Atomic = FromType->getAs()) - FromType = Atomic->getValueType(); - // If T is a non-class type, the type of the rvalue is the - // cv-unqualified version of T. Otherwise, the type of the rvalue - // is T (C++ 4.1p1). C++ can't get here with class types; in C, we - // just strip the qualifiers because they don't matter. - FromType = FromType.getUnqualifiedType(); - } else if (S.getLangOpts().HLSL && FromType->isConstantArrayType() && - ToType->isConstantArrayType()) { + bool argIsLValue = From->isGLValue(); + // To handle conversion from ArrayParameterType to ConstantArrayType, + // this block must be above the one below because array parameters + // do not decay, and when handling HLSLOutArgExprs + // the From expression is an LValue. + if (S.getLangOpts().HLSL && FromType->isConstantArrayType() && + ToType->isConstantArrayType()) { // HLSL constant array parameters do not decay, so if the argument is a // constant array and the parameter is an ArrayParameterType we have special // handling here. if (ToType->isArrayParameterType()) { FromType = S.Context.getArrayParameterType(FromType); SCS.First = ICK_HLSL_Array_RValue; + } else if (FromType->isArrayParameterType()) { + const ArrayParameterType *APT = cast(FromType); + FromType = APT->getConstantArrayType(S.Context); + SCS.First = ICK_HLSL_Array_RValue; } else { SCS.First = ICK_Identity; } @@ -2273,6 +2264,25 @@ static bool IsStandardConversion(Sema &S, Expr* From, QualType ToType, SCS.setAllToTypes(ToType); return true; + } else if (argIsLValue && !FromType->canDecayToPointerType() && + S.Context.getCanonicalType(FromType) != S.Context.OverloadTy) { + // Lvalue-to-rvalue conversion (C++11 4.1): + // A glvalue (3.10) of a non-function, non-array type T can + // be converted to a prvalue. + + SCS.First = ICK_Lvalue_To_Rvalue; + + // C11 6.3.2.1p2: + // ... if the lvalue has atomic type, the value has the non-atomic version + // of the type of the lvalue ... 
+ if (const AtomicType *Atomic = FromType->getAs()) + FromType = Atomic->getValueType(); + + // If T is a non-class type, the type of the rvalue is the + // cv-unqualified version of T. Otherwise, the type of the rvalue + // is T (C++ 4.1p1). C++ can't get here with class types; in C, we + // just strip the qualifiers because they don't matter. + FromType = FromType.getUnqualifiedType(); } else if (FromType->isArrayType()) { // Array-to-pointer conversion (C++ 4.2) SCS.First = ICK_Array_To_Pointer; diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index f32edc5ac0644..5fb936297aa54 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -5681,6 +5681,9 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, assert(!T.isNull() && "T must not be null at the end of this function"); if (!AreDeclaratorChunksValid) return Context.getTrivialTypeSourceInfo(T); + + if (state.didParseHLSLParamMod() && !T->isConstantArrayType()) + T = S.HLSL().getInoutParameterType(T); return GetTypeSourceInfoForDeclarator(state, T, TInfo); } @@ -8634,7 +8637,6 @@ static void HandleHLSLParamModifierAttr(TypeProcessingState &State, return; if (Attr.getSemanticSpelling() == HLSLParamModifierAttr::Keyword_inout || Attr.getSemanticSpelling() == HLSLParamModifierAttr::Keyword_out) { - CurType = S.HLSL().getInoutParameterType(CurType); State.setParsedHLSLParamMod(true); } } diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocSizeofChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocSizeofChecker.cpp index 9e81a6bd19fc5..df23735e4668e 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocSizeofChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocSizeofChecker.cpp @@ -211,9 +211,9 @@ class MallocSizeofChecker : public Checker { continue; const TypeSourceInfo *TSI = nullptr; - if (CallRec.CastedExprParent.is()) { - TSI = CallRec.CastedExprParent.get() - ->getTypeSourceInfo(); + if (const auto *VD = + dyn_cast(CallRec.CastedExprParent)) { + TSI = VD->getTypeSourceInfo(); } else { TSI = CallRec.ExplicitCastType; } diff --git a/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp b/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp index b0563b6c070f1..827c04143e658 100644 --- a/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp +++ b/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp @@ -196,13 +196,13 @@ const PointerToMemberData *BasicValueFactory::accumCXXBase( const NamedDecl *ND = nullptr; llvm::ImmutableList BaseSpecList; - if (PTMDT.isNull() || PTMDT.is()) { - if (PTMDT.is()) - ND = PTMDT.get(); + if (PTMDT.isNull() || isa(PTMDT)) { + if (const auto *NDP = dyn_cast_if_present(PTMDT)) + ND = NDP; BaseSpecList = CXXBaseListFactory.getEmptyList(); } else { - const PointerToMemberData *PTMD = PTMDT.get(); + const auto *PTMD = cast(PTMDT); ND = PTMD->getDeclaratorDecl(); BaseSpecList = PTMD->getCXXBaseList(); diff --git a/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp b/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp index 1e0cc2eea9ed8..c4af02f21f494 100644 --- a/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp @@ -211,9 +211,9 @@ void ExplodedNode::NodeGroup::replaceNode(ExplodedNode *node) { assert(!getFlag()); GroupStorage &Storage = reinterpret_cast(P); - assert(Storage.is()); + assert(isa(Storage)); Storage = node; - assert(Storage.is()); + assert(isa(Storage)); } void ExplodedNode::NodeGroup::addNode(ExplodedNode *N, ExplodedGraph &G) { @@ -222,7 +222,7 @@ void 
ExplodedNode::NodeGroup::addNode(ExplodedNode *N, ExplodedGraph &G) { GroupStorage &Storage = reinterpret_cast(P); if (Storage.isNull()) { Storage = N; - assert(Storage.is()); + assert(isa(Storage)); return; } @@ -230,7 +230,7 @@ void ExplodedNode::NodeGroup::addNode(ExplodedNode *N, ExplodedGraph &G) { if (!V) { // Switch from single-node to multi-node representation. - ExplodedNode *Old = Storage.get(); + auto *Old = cast(Storage); BumpVectorContext &Ctx = G.getNodeAllocator(); V = new (G.getAllocator()) ExplodedNodeVector(Ctx, 4); @@ -238,7 +238,7 @@ void ExplodedNode::NodeGroup::addNode(ExplodedNode *N, ExplodedGraph &G) { Storage = V; assert(!getFlag()); - assert(Storage.is()); + assert(isa(Storage)); } V->push_back(N, G.getNodeAllocator()); diff --git a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp index ad4e43630dd44..bbf2303b9f6ef 100644 --- a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp +++ b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp @@ -1068,10 +1068,10 @@ const VarRegion *MemRegionManager::getVarRegion(const VarDecl *D, llvm::PointerUnion V = getStackOrCaptureRegionForDeclContext(LC, DC, D); - if (V.is()) - return V.get(); + if (const auto *VR = dyn_cast_if_present(V)) + return VR; - const auto *STC = V.get(); + const auto *STC = cast(V); if (!STC) { // FIXME: Assign a more sensible memory space to static locals diff --git a/clang/lib/StaticAnalyzer/Core/SVals.cpp b/clang/lib/StaticAnalyzer/Core/SVals.cpp index 84e7e033404c0..d009552965eca 100644 --- a/clang/lib/StaticAnalyzer/Core/SVals.cpp +++ b/clang/lib/StaticAnalyzer/Core/SVals.cpp @@ -205,10 +205,10 @@ const NamedDecl *nonloc::PointerToMember::getDecl() const { return nullptr; const NamedDecl *ND = nullptr; - if (PTMD.is()) - ND = PTMD.get(); + if (const auto *NDP = dyn_cast(PTMD)) + ND = NDP; else - ND = PTMD.get()->getDeclaratorDecl(); + ND = cast(PTMD)->getDeclaratorDecl(); return ND; } @@ -227,16 +227,16 @@ nonloc::CompoundVal::iterator nonloc::CompoundVal::end() const { nonloc::PointerToMember::iterator nonloc::PointerToMember::begin() const { const PTMDataType PTMD = getPTMData(); - if (PTMD.is()) + if (isa(PTMD)) return {}; - return PTMD.get()->begin(); + return cast(PTMD)->begin(); } nonloc::PointerToMember::iterator nonloc::PointerToMember::end() const { const PTMDataType PTMD = getPTMData(); - if (PTMD.is()) + if (isa(PTMD)) return {}; - return PTMD.get()->end(); + return cast(PTMD)->end(); } //===----------------------------------------------------------------------===// diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index 9a6a31b5ec4c0..f70b77fe74636 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -7,6 +7,14 @@ // RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -fexperimental-new-constant-interpreter %s -verify=expected,both // RUN: %clang_cc1 -triple avr -std=c++20 -Wno-string-plus-int -verify=ref,both %s -Wno-constant-evaluated +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define LITTLE_END 1 +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define LITTLE_END 0 +#else +#error "huh?" 
+#endif + namespace strcmp { constexpr char kFoobar[6] = {'f','o','o','b','a','r'}; @@ -1128,6 +1136,10 @@ namespace ElementwisePopcount { static_assert(__builtin_elementwise_popcount(0L) == 0); static_assert(__builtin_elementwise_popcount(0xF0F0L) == 8); static_assert(__builtin_elementwise_popcount(~0LL) == 8 * sizeof(long long)); + +#if __INT_WIDTH__ == 32 + static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_popcount((vector4char){1, 2, 3, 4})) == (LITTLE_END ? 0x01020101 : 0x01010201)); +#endif } namespace BuiltinMemcpy { @@ -1138,4 +1150,23 @@ namespace BuiltinMemcpy { return b; } static_assert(simple() == 12); + + + extern struct Incomplete incomplete; + constexpr struct Incomplete *null_incomplete = 0; + static_assert(__builtin_memcpy(null_incomplete, null_incomplete, sizeof(wchar_t))); // both-error {{not an integral constant expression}} \ + // both-note {{source of 'memcpy' is nullptr}} + + wchar_t global; + constexpr wchar_t *null = 0; + static_assert(__builtin_memcpy(&global, null, sizeof(wchar_t))); // both-error {{not an integral constant expression}} \ + // both-note {{source of 'memcpy' is nullptr}} + + constexpr int simpleMove() { + int a = 12; + int b = 0; + __builtin_memmove(&b, &a, sizeof(a)); + return b; + } + static_assert(simpleMove() == 12); } diff --git a/clang/test/AST/ByteCode/literals.cpp b/clang/test/AST/ByteCode/literals.cpp index 13d6c4feb3500..662823c49cd4a 100644 --- a/clang/test/AST/ByteCode/literals.cpp +++ b/clang/test/AST/ByteCode/literals.cpp @@ -980,6 +980,8 @@ namespace DiscardExprs { __uuidof(number); // both-error {{cannot call operator __uuidof on a type with no GUID}} requires{false;}; + constexpr int *p = nullptr; + p - p; return 0; } diff --git a/clang/test/AST/ByteCode/vectors.cpp b/clang/test/AST/ByteCode/vectors.cpp index 08e2ca2adbf5c..a04b678a623a1 100644 --- a/clang/test/AST/ByteCode/vectors.cpp +++ b/clang/test/AST/ByteCode/vectors.cpp @@ -37,6 +37,7 @@ static_assert(arr4[1][0] == 0, ""); static_assert(arr4[1][0] == 0, ""); static_assert(arr4[1][0] == 0, ""); +constexpr VI4 B = __extension__(A); /// From constant-expression-cxx11.cpp namespace Vector { diff --git a/clang/test/AST/HLSL/ArrayOutArgExpr.hlsl b/clang/test/AST/HLSL/ArrayOutArgExpr.hlsl new file mode 100644 index 0000000000000..10825bf0f93bc --- /dev/null +++ b/clang/test/AST/HLSL/ArrayOutArgExpr.hlsl @@ -0,0 +1,63 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s + +// CHECK-LABEL: increment +void increment(inout int Arr[2]) { + for (int I = 0; I < 2; I++) + Arr[0] += 2; +} + +// CHECK-LABEL: call +// CHECK: CallExpr 0x{{.*}} {{.*}} 'void' +// CHECK: ImplicitCastExpr 0x{{.*}} {{.*}} 'void (*)(inout int[2])' +// CHECK: DeclRefExpr 0x{{.*}} {{.*}} 'void (inout int[2])' lvalue Function 0x{{.*}} 'increment' 'void (inout int[2])' +// CHECK: HLSLOutArgExpr 0x{{.*}} {{.*}} 'int[2]' lvalue inout +// CHECK: OpaqueValueExpr [[A:0x.*]] {{.*}} 'int[2]' lvalue +// CHECK: DeclRefExpr [[B:0x.*]] {{.*}} 'int[2]' lvalue Var [[E:0x.*]] 'A' 'int[2]' +// CHECK: OpaqueValueExpr [[C:0x.*]] {{.*}} 'int[2]' lvalue +// CHECK: ImplicitCastExpr [[D:0x.*]] {{.*}} 'int[2]' +// CHECK: OpaqueValueExpr [[A]] {{.*}} 'int[2]' lvalue +// CHECK: DeclRefExpr [[B]] {{.*}} 'int[2]' lvalue Var [[E]] 'A' 'int[2]' +// CHECK: BinaryOperator 0x{{.*}} {{.*}} 'int[2]' lvalue '=' +// CHECK: OpaqueValueExpr [[A]] {{.*}} 'int[2]' lvalue +// CHECK: DeclRefExpr 0x{{.*}} {{.*}} 'int[2]' lvalue Var [[E]] 'A' 'int[2]' +// CHECK: ImplicitCastExpr 0x{{.*}} {{.*}} 
'int[2]' +// CHECK: OpaqueValueExpr [[C]] {{.*}} 'int[2]' lvalue +// CHECK: ImplicitCastExpr [[D]] {{.*}} 'int[2]' +// CHECK: OpaqueValueExpr [[A]] {{.*}} 'int[2]' lvalue +// CHECK: DeclRefExpr [[B]] {{.*}} 'int[2]' lvalue Var [[E]] 'A' 'int[2]' +export int call() { + int A[2] = { 0, 1 }; + increment(A); + return A[0]; +} + +// CHECK-LABEL: fn2 +void fn2(out int Arr[2]) { + Arr[0] += 5; + Arr[1] += 6; +} + +// CHECK-LABEL: call2 +// CHECK: CallExpr 0x{{.*}} {{.*}} 'void' +// CHECK: ImplicitCastExpr 0x{{.*}} {{.*}} 'void (*)(out int[2])' +// CHECK: DeclRefExpr 0x{{.*}} {{.*}} 'void (out int[2])' lvalue Function 0x{{.*}} 'fn2' 'void (out int[2])' +// CHECK: HLSLOutArgExpr 0x{{.*}} {{.*}} 'int[2]' lvalue out +// CHECK: OpaqueValueExpr [[A:0x.*]] {{.*}} 'int[2]' lvalue +// CHECK: DeclRefExpr [[B:0x.*]] {{.*}} 'int[2]' lvalue Var [[E:0x.*]] 'A' 'int[2]' +// CHECK: OpaqueValueExpr [[C:0x.*]] {{.*}} 'int[2]' lvalue +// CHECK: ImplicitCastExpr [[D:0x.*]] {{.*}} 'int[2]' +// CHECK: OpaqueValueExpr [[A]] {{.*}} 'int[2]' lvalue +// CHECK: DeclRefExpr [[B]] {{.*}} 'int[2]' lvalue Var [[E]] 'A' 'int[2]' +// CHECK: BinaryOperator 0x{{.*}} {{.*}} 'int[2]' lvalue '=' +// CHECK: OpaqueValueExpr [[A]] {{.*}} 'int[2]' lvalue +// CHECK: DeclRefExpr [[B]] {{.*}} 'int[2]' lvalue Var [[E]] 'A' 'int[2]' +// CHECK: ImplicitCastExpr 0x{{.*}} {{.*}} 'int[2]' +// CHECK: OpaqueValueExpr [[C]] {{.*}} 'int[2]' lvalue +// CHECK: ImplicitCastExpr [[D]] {{.*}} 'int[2]' +// CHECK: OpaqueValueExpr [[A]] {{.*}} 'int[2]' lvalue +// CHECK: DeclRefExpr [[B]] {{.*}} 'int[2]' lvalue Var [[E]] 'A' 'int[2]' +export int call2() { + int A[2] = { 0, 1 }; + fn2(A); + return 1; +} diff --git a/clang/test/AST/ast-print-openacc-combined-construct.cpp b/clang/test/AST/ast-print-openacc-combined-construct.cpp index d16e446706807..435c770c7457d 100644 --- a/clang/test/AST/ast-print-openacc-combined-construct.cpp +++ b/clang/test/AST/ast-print-openacc-combined-construct.cpp @@ -224,4 +224,22 @@ void foo() { for(int i = 0;i<5;++i) for(int i = 0;i<5;++i); +// CHECK: #pragma acc parallel loop num_gangs(i, (int)array[2]) +// CHECK-NEXT: for (int i = 0; i < 5; ++i) +// CHECK-NEXT: ; +#pragma acc parallel loop num_gangs(i, (int)array[2]) + for(int i = 0;i<5;++i); + +// CHECK: #pragma acc parallel loop num_workers(i) +// CHECK-NEXT: for (int i = 0; i < 5; ++i) +// CHECK-NEXT: ; +#pragma acc parallel loop num_workers(i) + for(int i = 0;i<5;++i); + +// CHECK: #pragma acc parallel loop vector_length((int)array[1]) +// CHECK-NEXT: for (int i = 0; i < 5; ++i) +// CHECK-NEXT: ; +#pragma acc parallel loop vector_length((int)array[1]) + for(int i = 0;i<5;++i); + } diff --git a/clang/test/CXX/drs/cwg5xx.cpp b/clang/test/CXX/drs/cwg5xx.cpp index ed0c7159dfc88..91a76fd2adbb6 100644 --- a/clang/test/CXX/drs/cwg5xx.cpp +++ b/clang/test/CXX/drs/cwg5xx.cpp @@ -1178,17 +1178,61 @@ namespace cwg590 { // cwg590: yes template typename A::B::C A::B::C::f(A::B::C) {} } -namespace cwg591 { // cwg591: no +namespace cwg591 { // cwg591: 20 template struct A { typedef int M; struct B { typedef void M; struct C; + struct D; + }; + }; + + template struct G { + struct B { + typedef int M; + struct C { + typedef void M; + struct D; + }; + }; + }; + + template struct H { + template struct B { + typedef int M; + template struct C { + typedef void M; + struct D; + struct P; + }; }; }; template struct A::B::C : A { - // FIXME: Should find member of non-dependent base class A. 
+ M m; + }; + + template struct G::B::C::D : B { + M m; + }; + + template + template + template + struct H::B::C::D : B { + M m; + }; + + template struct A::B::D : A { + M m; + // expected-error@-1 {{field has incomplete type 'M' (aka 'void'}} + }; + + template + template + template + struct H::B::C::P : B { M m; // expected-error@-1 {{field has incomplete type 'M' (aka 'void'}} }; diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c index 2033a8b4c335f..c92aad633082f 100644 --- a/clang/test/CodeGen/attr-target-x86.c +++ b/clang/test/CodeGen/attr-target-x86.c @@ -1,80 +1,115 @@ // RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu i686 -tune-cpu i686 -emit-llvm %s -o - | FileCheck %s -int baz(int a) { return 4; } +// CHECK: define {{.*}}@f_default({{.*}} [[f_default:#[0-9]+]] +// CHECK: define {{.*}}@f_avx_sse4_2_ivybridge({{.*}} [[f_avx_sse4_2_ivybridge:#[0-9]+]] +// CHECK: define {{.*}}@f_fpmath_387({{.*}} [[f_default]] +// CHECK: define {{.*}}@f_no_sse2({{.*}} [[f_no_sse2:#[0-9]+]] +// CHECK: define {{.*}}@f_sse4({{.*}} [[f_sse4:#[0-9]+]] +// CHECK: define {{.*}}@f_no_sse4({{.*}} [[f_no_sse4:#[0-9]+]] +// CHECK: define {{.*}}@f_default2({{.*}} [[f_default]] +// CHECK: define {{.*}}@f_avx_sse4_2_ivybridge_2({{.*}} [[f_avx_sse4_2_ivybridge]] +// CHECK: define {{.*}}@f_no_aes_ivybridge({{.*}} [[f_no_aes_ivybridge:#[0-9]+]] +// CHECK: define {{.*}}@f_no_mmx({{.*}} [[f_no_mmx:#[0-9]+]] +// CHECK: define {{.*}}@f_lakemont_mmx({{.*}} [[f_lakemont_mmx:#[0-9]+]] +// CHECK: define {{.*}}@f_use_before_def({{.*}} [[f_lakemont_mmx]] +// CHECK: define {{.*}}@f_tune_sandybridge({{.*}} [[f_tune_sandybridge:#[0-9]+]] +// CHECK: define {{.*}}@f_x86_64_v2({{.*}} [[f_x86_64_v2:#[0-9]+]] +// CHECK: define {{.*}}@f_x86_64_v3({{.*}} [[f_x86_64_v3:#[0-9]+]] +// CHECK: define {{.*}}@f_x86_64_v4({{.*}} [[f_x86_64_v4:#[0-9]+]] +// CHECK: define {{.*}}@f_avx10_1_256{{.*}} [[f_avx10_1_256:#[0-9]+]] +// CHECK: define {{.*}}@f_avx10_1_512{{.*}} [[f_avx10_1_512:#[0-9]+]] +// CHECK: define {{.*}}@f_prefer_256_bit({{.*}} [[f_prefer_256_bit:#[0-9]+]] +// CHECK: define {{.*}}@f_no_prefer_256_bit({{.*}} [[f_no_prefer_256_bit:#[0-9]+]] + +// CHECK: [[f_default]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="i686" +void f_default(void) {} + +// CHECK: [[f_avx_sse4_2_ivybridge]] = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" +__attribute__((target("avx,sse4.2,arch=ivybridge"))) +void f_avx_sse4_2_ivybridge(void) {} + +// We're currently ignoring the fpmath attribute. So checked above that +// attributes are identical to f_default. 
+__attribute__((target("fpmath=387"))) +void f_fpmath_387(void) {} -int __attribute__((target("avx,sse4.2,arch=ivybridge"))) foo(int a) { return 4; } - -int __attribute__((target("fpmath=387"))) koala(int a) { return 4; } - -int __attribute__((target("no-sse2"))) echidna(int a) { return 4; } - -int __attribute__((target("sse4"))) panda(int a) { return 4; } -int __attribute__((target("no-sse4"))) narwhal(int a) { return 4; } +// CHECK-NOT: tune-cpu +// CHECK: [[f_no_sse2]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-amx-avx512,-avx,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" +__attribute__((target("no-sse2"))) +void f_no_sse2(void) {} + +// CHECK: [[f_sse4]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686" +__attribute__((target("sse4"))) +void f_sse4(void) {} + +// CHECK: [[f_no_sse4]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-amx-avx512,-avx,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" +__attribute__((target("no-sse4"))) +void f_no_sse4(void) {} + +// checked above that attributes are identical to f_default +void f_default2(void) { + f_avx_sse4_2_ivybridge(); + return f_default(); +} -int bar(int a) { return baz(a) + foo(a); } +// Checked above to have same attributes as f_avx_sse4_2_ivybridge +__attribute__((target("avx, sse4.2, arch= ivybridge"))) +void f_avx_sse4_2_ivybridge_2(void) {} -int __attribute__((target("avx, sse4.2, arch= ivybridge"))) qux(int a) { return 4; } -int __attribute__((target("no-aes, arch=ivybridge"))) qax(int a) { return 4; } +// CHECK: [[f_no_aes_ivybridge]] = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-amx-avx512,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-vaes" +__attribute__((target("no-aes, arch=ivybridge"))) +void f_no_aes_ivybridge(void) {} -int __attribute__((target("no-mmx"))) qq(int a) { return 40; } +// CHECK-NOT: tune-cpu +// CHECK: [[f_no_mmx]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-mmx" +__attribute__((target("no-mmx"))) +void f_no_mmx(void) {} -int __attribute__((target("arch=lakemont,mmx"))) lake(int a) { return 4; } +// CHECK: [[f_lakemont_mmx]] = {{.*}}"target-cpu"="lakemont" "target-features"="+cx8,+mmx" +// Adding the attribute to a definition does update it in IR. +__attribute__((target("arch=lakemont,mmx"))) +void f_lakemont_mmx(void) {} -int use_before_def(void); -int useage(void){ - return use_before_def(); +void f_use_before_def(void); +void usage(void){ + f_use_before_def(); } -// Adding the attribute to a definition does update it in IR. 
-int __attribute__((target("arch=lakemont,mmx"))) use_before_def(void) { - return 5; -} +// Checked above to have same attributes as f_lakemont_mmx +__attribute__((target("arch=lakemont,mmx"))) +void f_use_before_def(void) {} -int __attribute__((target("tune=sandybridge"))) walrus(int a) { return 4; } - -void __attribute__((target("arch=x86-64-v2"))) x86_64_v2(void) {} -void __attribute__((target("arch=x86-64-v3"))) x86_64_v3(void) {} -void __attribute__((target("arch=x86-64-v4"))) x86_64_v4(void) {} - -void __attribute__((target("avx10.1-256"))) avx10_1_256(void) {} -void __attribute__((target("avx10.1-512"))) avx10_1_512(void) {} - -// Check that we emit the additional subtarget and cpu features for foo and not for baz or bar. -// CHECK: baz{{.*}} #0 -// CHECK: foo{{.*}} #1 -// We're currently ignoring the fpmath attribute so koala should be identical to baz and bar. -// CHECK: koala{{.*}} #0 -// CHECK: echidna{{.*}} #2 -// CHECK: panda{{.*}} #3 -// CHECK: narwhal{{.*}} #4 -// CHECK: bar{{.*}} #0 -// CHECK: qux{{.*}} #1 -// CHECK: qax{{.*}} #5 -// CHECK: qq{{.*}} #6 -// CHECK: lake{{.*}} #7 -// CHECK: use_before_def{{.*}} #7 -// CHECK: walrus{{.*}} #8 -// CHECK: avx10_1_256{{.*}} #12 -// CHECK: avx10_1_512{{.*}} #13 -// CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="i686" -// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" -// CHECK-NOT: tune-cpu -// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-amx-avx512,-avx,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" -// CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686" -// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-amx-avx512,-avx,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" -// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-amx-avx512,-avx10.1-256,-avx10.1-512,-avx10.2-256,-avx10.2-512,-vaes" -// CHECK-NOT: tune-cpu -// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-mmx" -// CHECK: #7 = {{.*}}"target-cpu"="lakemont" "target-features"="+cx8,+mmx" -// CHECK-NOT: tune-cpu -// CHECK: #8 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="sandybridge" +// CHECK: [[f_tune_sandybridge]] = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="sandybridge" +__attribute__((target("tune=sandybridge"))) +void f_tune_sandybridge(void) {} -// 
CHECK: "target-cpu"="x86-64-v2" +// CHECK: [[f_x86_64_v2]] ={{.*}}"target-cpu"="x86-64-v2" // CHECK-SAME: "target-features"="+cmov,+crc32,+cx16,+cx8,+fxsr,+mmx,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" -// CHECK: "target-cpu"="x86-64-v3" +__attribute__((target("arch=x86-64-v2"))) +void f_x86_64_v2(void) {} + +// CHECK: [[f_x86_64_v3]] = {{.*}}"target-cpu"="x86-64-v3" // CHECK-SAME: "target-features"="+avx,+avx2,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fxsr,+lzcnt,+mmx,+movbe,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" -// CHECK: "target-cpu"="x86-64-v4" +__attribute__((target("arch=x86-64-v3"))) +void f_x86_64_v3(void) {} + +// CHECK: [[f_x86_64_v4]] = {{.*}}"target-cpu"="x86-64-v4" // CHECK-SAME: "target-features"="+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fxsr,+lzcnt,+mmx,+movbe,+popcnt,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" +__attribute__((target("arch=x86-64-v4"))) +void f_x86_64_v4(void) {} + +// CHECK: [[f_avx10_1_256]] = {{.*}}"target-cpu"="i686" "target-features"="+aes,+avx,+avx10.1-256,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512fp16,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+cmov,+crc32,+cx8,+f16c,+fma,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+x87,+xsave,-amx-avx512,-avx10.1-512,-avx10.2-512,-evex512" +__attribute__((target("avx10.1-256"))) +void f_avx10_1_256(void) {} + +// CHECK: [[f_avx10_1_512]] = {{.*}}"target-cpu"="i686" "target-features"="+aes,+avx,+avx10.1-256,+avx10.1-512,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512fp16,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+x87,+xsave" +__attribute__((target("avx10.1-512"))) +void f_avx10_1_512(void) {} + +// CHECK: [[f_prefer_256_bit]] = {{.*}}"target-features"="{{.*}}+prefer-256-bit +__attribute__((target("prefer-256-bit"))) +void f_prefer_256_bit(void) {} -// CHECK: #12 = {{.*}}"target-cpu"="i686" "target-features"="+aes,+avx,+avx10.1-256,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512fp16,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+cmov,+crc32,+cx8,+f16c,+fma,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+x87,+xsave,-amx-avx512,-avx10.1-512,-avx10.2-512,-evex512" -// CHECK: #13 = {{.*}}"target-cpu"="i686" "target-features"="+aes,+avx,+avx10.1-256,+avx10.1-512,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512fp16,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vpopcntdq,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+vaes,+vpclmulqdq,+x87,+xsave" +// CHECK: [[f_no_prefer_256_bit]] = {{.*}}"target-features"="{{.*}}-prefer-256-bit +__attribute__((target("no-prefer-256-bit"))) +void f_no_prefer_256_bit(void) {} \ No newline at end of file diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c index f1f34432ca0ea..82f82dd1ed794 100644 --- a/clang/test/CodeGen/builtins-elementwise-math.c +++ b/clang/test/CodeGen/builtins-elementwise-math.c @@ -443,7 +443,7 @@ void test_builtin_elementwise_bitreverse(si8 vi1, si8 vi2, // CHECK-NEXT: call i32 @llvm.bitreverse.i32(i32 
[[IA1]]) b = __builtin_elementwise_bitreverse(int_as_one); - // CHECK: call i32 @llvm.bitreverse.i32(i32 -10) + // CHECK: store i32 1879048191, ptr @b, align 4 b = __builtin_elementwise_bitreverse(-10); // CHECK: [[SI:%.+]] = load i16, ptr %si.addr, align 2 diff --git a/clang/test/CodeGenCXX/ms-uneval-context-crash.cpp b/clang/test/CodeGenCXX/ms-uneval-context-crash.cpp new file mode 100644 index 0000000000000..b2f7e58381da8 --- /dev/null +++ b/clang/test/CodeGenCXX/ms-uneval-context-crash.cpp @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -std=c++20 -fms-compatibility -fms-compatibility-version=19.33 -emit-llvm %s -o - -triple=x86_64-windows-msvc | FileCheck %s + +template +concept C = requires +{ + { T::test([](){}) }; +}; + +template +struct Widget {}; + +template +struct Widget {}; + +struct Baz +{ + template + static constexpr decltype(auto) test(F&&) {} +}; + +void test() +{ + Widget w; +} +// CHECK: @"?test@@YAXXZ" diff --git a/clang/test/CodeGenHLSL/ArrayAssignable.hlsl b/clang/test/CodeGenHLSL/ArrayAssignable.hlsl index a0dfe26e5d147..e2ff2de68ed99 100644 --- a/clang/test/CodeGenHLSL/ArrayAssignable.hlsl +++ b/clang/test/CodeGenHLSL/ArrayAssignable.hlsl @@ -100,18 +100,16 @@ void arr_assign6() { } // CHECK-LABEL: define void {{.*}}arr_assign7 -// CHECK: [[Arr3:%.*]] = alloca [2 x [2 x i32]], align 4 -// CHECK-NEXT: [[Arr4:%.*]] = alloca [2 x [2 x i32]], align 4 -// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4 +// CHECK: [[Arr:%.*]] = alloca [2 x [2 x i32]], align 4 +// CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x [2 x i32]], align 4 // CHECK-NOT: alloca -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Arr3]], ptr align 4 {{@.*}}, i32 16, i1 false) -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Arr4]], ptr align 4 {{@.*}}, i32 16, i1 false) -// CHECK-NEXT: store i32 6, ptr [[Tmp]], align 4 -// CHECK-NEXT: [[AIE:%.*]] = getelementptr inbounds i32, ptr [[Tmp]], i32 1 -// CHECK-NEXT: store i32 6, ptr [[AIE]], align 4 -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Arr3]], ptr align 4 [[Arr4]], i32 16, i1 false) -// CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr [[Arr3]], i32 0, i32 0 -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Idx]], ptr align 4 [[Tmp]], i32 8, i1 false) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Arr]], ptr align 4 {{@.*}}, i32 16, i1 false) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Arr2]], ptr align 4 {{@.*}}, i32 16, i1 false) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Arr]], ptr align 4 [[Arr2]], i32 16, i1 false) +// CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr [[Arr]], i32 0, i32 0 +// CHECK-NEXT: store i32 6, ptr [[Idx]], align 4 +// CHECK-NEXT: [[Idx2:%.*]] = getelementptr inbounds i32, ptr %arrayidx, i32 1 +// CHECK-NEXT: store i32 6, ptr [[Idx2]], align 4 // CHECK-NEXT: ret void void arr_assign7() { int Arr[2][2] = {{0, 1}, {2, 3}}; diff --git a/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl new file mode 100644 index 0000000000000..eb7d755bca61d --- /dev/null +++ b/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl @@ -0,0 +1,122 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s + +// CHECK-LABEL: increment +void increment(inout int Arr[2]) { + for (int I = 0; I < 2; I++) + Arr[0] += 2; +} + +// CHECK-LABEL: 
+// CHECK: [[A:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
+// CHECK-NEXT: call void @{{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
+// CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
+// CHECK-NEXT: ret i32 [[B]]
+export int arrayCall() {
+  int A[2] = { 0, 1 };
+  increment(A);
+  return A[0];
+}
+
+// CHECK-LABEL: fn2
+void fn2(out int Arr[2]) {
+  Arr[0] += 5;
+  Arr[1] += 6;
+}
+
+// CHECK-LABEL: arrayCall2
+// CHECK: [[A:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @{{.*}}fn2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
+// CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0
+// CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
+// CHECK-NEXT: ret i32 [[B]]
+export int arrayCall2() {
+  int A[2] = { 0, 1 };
+  fn2(A);
+  return A[0];
+}
+
+// CHECK-LABEL: nestedCall
+void nestedCall(inout int Arr[2], uint index) {
+  if (index < 2) {
+    Arr[index] += 2;
+    nestedCall(Arr, index+1);
+  }
+}
+
+// CHECK-LABEL: arrayCall3
+// CHECK: [[A:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
+// CHECK-NEXT: call void @{{.*}}nestedCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]], i32 noundef 0) #3
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
+// CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 1
+// CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
+// CHECK-NEXT: ret i32 [[B]]
+export int arrayCall3() {
+  int A[2] = { 0, 1 };
+  nestedCall(A, 0);
+  return A[1];
+}
+
+// CHECK-LABEL: outerCall
+// CHECK: [[Tmp:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 %{{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void {{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 {{.*}}, ptr align 4 [[Tmp]], i32 8, i1 false)
+// CHECK-NEXT: ret void
+void outerCall(inout int Arr[2]) {
+  increment(Arr);
+}
+
+// CHECK-LABEL: arrayCall4
+// CHECK: [[A:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
+// CHECK-NEXT: call void @{{.*}}outerCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void
@llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false) +// CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0 +// CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4 +// CHECK-NEXT: ret i32 [[B]] +export int arrayCall4() { + int A[2] = { 0, 1 }; + outerCall(A); + return A[0]; +} + +// CHECK-LABEL: fn3 +void fn3(int Arr[2]) {} + +// CHECK-LABEL: outerCall2 +// CHECK: [[Tmp:%.*]] = alloca [2 x i32], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 {{.*}}, i32 8, i1 false) +// CHECK-NEXT: call void {{.*}}fn3{{.*}}(ptr noundef byval([2 x i32]) align 4 [[Tmp]]) #3 +// CHECK-NEXT: ret void +void outerCall2(inout int Arr[2]) { + fn3(Arr); +} + +// CHECK-LABEL: arrayCall5 +// CHECK: [[A:%.*]] = alloca [2 x i32], align 4 +// CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false) +// CHECK-NEXT: call void @{{.*}}outerCall2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false) +// CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0 +// CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4 +// CHECK-NEXT: ret i32 [[B]] +export int arrayCall5() { + int A[2] = { 0, 1 }; + outerCall2(A); + return A[0]; +} diff --git a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl new file mode 100644 index 0000000000000..9d95d54852c0b --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: -DTARGET=dx -DFNATTRS=noundef -check-prefixes=CHECK,CHECK-DXIL +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: -DTARGET=spv -DFNATTRS="spir_func noundef" -check-prefixes=CHECK,CHECK-SPIRV + +// CHECK-DXIL: define void @ +// CHECK-SPIRV: define spir_func void @ +void test_GroupMemoryBarrierWithGroupSync() { +// CHECK-DXIL: call void @llvm.[[TARGET]].group.memory.barrier.with.group.sync() +// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].group.memory.barrier.with.group.sync() + GroupMemoryBarrierWithGroupSync(); +} + +// CHECK: declare void @llvm.[[TARGET]].group.memory.barrier.with.group.sync() #[[ATTRS:[0-9]+]] +// CHECK-NOT: attributes #[[ATTRS]] = {{.+}}memory(none){{.+}} +// CHECK: attributes #[[ATTRS]] = { diff --git a/clang/test/Lexer/gnu-flags.c b/clang/test/Lexer/gnu-flags.c index 6c7bf9405ddf0..30cfcf710f346 100644 --- a/clang/test/Lexer/gnu-flags.c +++ b/clang/test/Lexer/gnu-flags.c @@ -16,6 +16,8 @@ #if ALL || ZEROARGS +// expected-warning@+9 {{passing no argument for the '...' 
parameter of a variadic macro is a C23 extension}} +// expected-note@+4 {{macro 'efoo' defined here}} // expected-warning@+3 {{token pasting of ',' and __VA_ARGS__ is a GNU extension}} #endif diff --git a/clang/test/Preprocessor/macro_fn.c b/clang/test/Preprocessor/macro_fn.c index 81d8363214078..2e72bd272084e 100644 --- a/clang/test/Preprocessor/macro_fn.c +++ b/clang/test/Preprocessor/macro_fn.c @@ -1,11 +1,17 @@ /* RUN: %clang_cc1 %s -Eonly -std=c89 -pedantic -verify */ +// RUN: %clang_cc1 %s -Eonly -std=c89 -pedantic -Wno-gnu-zero-variadic-macro-arguments -verify -DOMIT_VARIADIC_MACRO_ARGS -DVARIADIC_MACRO_ARGS_REMOVE_COMMA +// RUN: %clang_cc1 %s -Eonly -std=c89 -pedantic -Wno-variadic-macro-arguments-omitted -verify -DOMIT_VARIADIC_MACRO_ARGS /* PR3937 */ #define zero() 0 /* expected-note 2 {{defined here}} */ #define one(x) 0 /* expected-note 2 {{defined here}} */ #define two(x, y) 0 /* expected-note 4 {{defined here}} */ #define zero_dot(...) 0 /* expected-warning {{variadic macros are a C99 feature}} */ -#define one_dot(x, ...) 0 /* expected-warning {{variadic macros are a C99 feature}} expected-note 2{{macro 'one_dot' defined here}} */ +#define one_dot(x, ...) 0 /* expected-warning {{variadic macros are a C99 feature}} */ + +#ifndef OMIT_VARIADIC_MACRO_ARGS +/* expected-note@-3 2{{macro 'one_dot' defined here}} */ +#endif zero() zero(1); /* expected-error {{too many arguments provided to function-like macro invocation}} */ @@ -37,16 +43,24 @@ e(x) e() zero_dot() -one_dot(x) /* empty ... argument: expected-warning {{passing no argument for the '...' parameter of a variadic macro is a C23 extension}} */ -one_dot() /* empty first argument, elided ...: expected-warning {{passing no argument for the '...' parameter of a variadic macro is a C23 extension}} */ +one_dot(x) /* empty ... argument */ +one_dot() /* empty first argument, elided ... */ +#ifndef OMIT_VARIADIC_MACRO_ARGS +/* expected-warning@-4 {{passing no argument for the '...' parameter of a variadic macro is a C23 extension}} */ +/* expected-warning@-4 {{passing no argument for the '...' parameter of a variadic macro is a C23 extension}} */ +#endif /* Crash with function-like macro test at end of directive. */ #define E() (i == 0) #if E #endif - #define NSAssert(condition, desc, ...) /* expected-warning {{variadic macros are a C99 feature}} */ \ - SomeComplicatedStuff((desc), ##__VA_ARGS__) /* expected-warning {{token pasting of ',' and __VA_ARGS__ is a GNU extension}} */ + SomeComplicatedStuff((desc), ##__VA_ARGS__) + +#ifndef VARIADIC_MACRO_ARGS_REMOVE_COMMA +/* expected-warning@-3 {{token pasting of ',' and __VA_ARGS__ is a GNU extension}} */ +#endif + NSAssert(somecond, somedesc) diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp index 772a682141ce4..45c729f76418d 100644 --- a/clang/test/Sema/constant_builtins_vector.cpp +++ b/clang/test/Sema/constant_builtins_vector.cpp @@ -817,3 +817,8 @@ static_assert(__builtin_elementwise_popcount(~0U) == 8 * sizeof(int)); static_assert(__builtin_elementwise_popcount(0L) == 0); static_assert(__builtin_elementwise_popcount(0xF0F0L) == 8); static_assert(__builtin_elementwise_popcount(~0LL) == 8 * sizeof(long long)); + +static_assert(__builtin_elementwise_bitreverse(0x12345678) == 0x1E6A2C48); +static_assert(__builtin_elementwise_bitreverse(0x0123456789ABCDEFULL) == 0xF7B3D591E6A2C480); +static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_bitreverse((vector4char){1, 2, 4, 8})) == (LITTLE_END ? 
0x10204080 : 0x80402010));
+static_assert(__builtin_bit_cast(unsigned long long, __builtin_elementwise_bitreverse((vector4short){1, 2, 4, 8})) == (LITTLE_END ? 0x1000200040008000 : 0x8000400020001000));
diff --git a/clang/test/Sema/enum.c b/clang/test/Sema/enum.c
index 4f6d04ba7f918..3db301dab0a45 100644
--- a/clang/test/Sema/enum.c
+++ b/clang/test/Sema/enum.c
@@ -121,6 +121,21 @@ int NegativeShortTest[NegativeShort == -1 ? 1 : -1];
 enum Color { Red, Green, Blue }; // expected-note{{previous use is here}}
 typedef struct Color NewColor; // expected-error {{use of 'Color' with tag type that does not match previous declaration}}
 
+// Enumerations with a fixed underlying type.
+// https://github.com/llvm/llvm-project/issues/116880
+#if __STDC_VERSION__ >= 202311L
+  static_assert(__has_feature(c_fixed_enum));
+  static_assert(__has_extension(c_fixed_enum)); // Matches behavior for c_alignas, etc
+#else
+  _Static_assert(__has_extension(c_fixed_enum), "");
+  _Static_assert(!__has_feature(c_fixed_enum), "");
+#if __STDC_VERSION__ < 201112L
+  // expected-warning@-3 {{'_Static_assert' is a C11 extension}}
+  // expected-warning@-3 {{'_Static_assert' is a C11 extension}}
+#endif
+#endif
+typedef enum : unsigned char { Pink, Black, Cyan } Color; // pre-c23-warning {{enumeration types with a fixed underlying type are a C23 extension}}
+
 // PR28903
 // In C it is valid to define tags inside enums.
 struct PR28903 {
diff --git a/clang/test/SemaCXX/lambda-capture-type-deduction.cpp b/clang/test/SemaCXX/lambda-capture-type-deduction.cpp
index a86f301898992..b7a3d77cfc2f4 100644
--- a/clang/test/SemaCXX/lambda-capture-type-deduction.cpp
+++ b/clang/test/SemaCXX/lambda-capture-type-deduction.cpp
@@ -298,6 +298,22 @@ void __trans_tmp_1() {
 }
 
+namespace GH115931 {
+
+struct Range {};
+
+template <typename T>
+struct LengthPercentage {};
+
+void reflectSum() {
+  Range resultR;
+  [&] (auto) -> LengthPercentage<decltype(resultR)> {
+    return {};
+  }(0);
+}
+
+} // namespace GH115931
+
 namespace GH47400 {
 
 struct Foo {};
@@ -319,3 +335,17 @@ constexpr void foo() {
 }
 
 } // namespace GH47400
+
+namespace GH84961 {
+
+template <class T> void g(const T &t) {}
+
+template <class T> void f(const T &t) {
+  [t] { g(t); }();
+}
+
+void h() {
+  f(h);
+}
+
+} // namespace GH84961
diff --git a/clang/test/SemaHLSL/ArrayTemporary.hlsl b/clang/test/SemaHLSL/ArrayTemporary.hlsl
index dff9aff7d9b29..0266a198e7ec9 100644
--- a/clang/test/SemaHLSL/ArrayTemporary.hlsl
+++ b/clang/test/SemaHLSL/ArrayTemporary.hlsl
@@ -75,17 +75,17 @@ void template_fn(T Val) {}
 // CHECK: CallExpr {{.*}} 'void'
 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float[2])'
 // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float[2])' lvalue Function {{.*}} 'template_fn' 'void (float[2])' (FunctionTemplate {{.*}} 'template_fn')
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float[2]'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float[2]'
 // CHECK-NEXT: DeclRefExpr {{.*}} 'float[2]' lvalue ParmVar {{.*}} 'FA2' 'float[2]'
 // CHECK-NEXT: CallExpr {{.*}} 'void'
 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float[4])'
 // CHECK-NEXT: DeclRefExpr {{.*}} 'void (float[4])' lvalue Function {{.*}} 'template_fn' 'void (float[4])' (FunctionTemplate {{.*}} 'template_fn')
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float[4]'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float[4]'
 // CHECK-NEXT: DeclRefExpr {{.*}} 'float[4]' lvalue ParmVar {{.*}} 'FA4' 'float[4]'
 // CHECK-NEXT: CallExpr {{.*}} 'void'
 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(int[3])'
 // CHECK-NEXT: DeclRefExpr {{.*}} 'void (int[3])' lvalue Function {{.*}} 'template_fn' 'void
(int[3])' (FunctionTemplate {{.*}} 'template_fn') -// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int[3]' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int[3]' // CHECK-NEXT: DeclRefExpr {{.*}} 'int[3]' lvalue ParmVar {{.*}} 'IA3' 'int[3]' void call(float FA2[2], float FA4[4], int IA3[3]) { diff --git a/clang/test/SemaHLSL/BuiltIns/GroupMemoryBarrierWithGroupSync-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/GroupMemoryBarrierWithGroupSync-errors.hlsl new file mode 100644 index 0000000000000..24067fbb275b7 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/GroupMemoryBarrierWithGroupSync-errors.hlsl @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify + +void test_too_many_arg() { + __builtin_hlsl_group_memory_barrier_with_group_sync(0); + // expected-error@-1 {{too many arguments to function call, expected 0, have 1}} +} diff --git a/clang/test/SemaHLSL/Language/ArrayOutputArgs-errors.hlsl b/clang/test/SemaHLSL/Language/ArrayOutputArgs-errors.hlsl new file mode 100644 index 0000000000000..46bed0d5a7cbd --- /dev/null +++ b/clang/test/SemaHLSL/Language/ArrayOutputArgs-errors.hlsl @@ -0,0 +1,51 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -verify + +void increment(inout int Arr[2]) { + for (int I = 0; I < 2; I++) + Arr[0] += 2; +} + +export int wrongSize() { + int A[3] = { 0, 1, 2 }; + increment(A); + // expected-error@-1 {{no matching function for call to 'increment'}} + // expected-note@*:* {{candidate function not viable: no known conversion from 'int[3]' to 'int[2]' for 1st argument}} + return A[0]; +} + +export int wrongSize2() { + int A[1] = { 0 }; + increment(A); + // expected-error@-1 {{no matching function for call to 'increment'}} + // expected-note@*:* {{candidate function not viable: no known conversion from 'int[1]' to 'int[2]' for 1st argument}} + return A[0]; +} + +export void tooFewArgs() { + increment(); + // expected-error@-1 {{no matching function for call to 'increment'}} + // expected-note@*:* {{candidate function not viable: requires single argument 'Arr', but no arguments were provided}} +} + +export float wrongType() { + float A[2] = { 0, 1 }; + increment(A); + // expected-error@-1 {{no matching function for call to 'increment'}} + // expected-note@*:* {{candidate function not viable: no known conversion from 'float[2]' to 'int[2]' for 1st argument}} + return A[0]; +} + +export int wrongType2() { + increment(5); + // expected-error@-1 {{no matching function for call to 'increment'}} + // expected-note@*:* {{candidate function not viable: no known conversion from 'int' to 'int[2]' for 1st argument}} + return 1; +} + +export void tooManyArgs() { + int A[2] = { 0, 1 }; + int B[2] = { 2, 3 }; + increment(A, B); + // expected-error@-1 {{no matching function for call to 'increment'}} + // expected-note@*:* {{candidate function not viable: requires single argument 'Arr', but 2 arguments were provided}} +} diff --git a/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c b/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c index fc5250ce548e4..a6f57a63a91dd 100644 --- a/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c +++ b/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c @@ -134,16 +134,10 @@ void uses() { // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}} #pragma acc parallel loop auto bind(Var) for(unsigned i = 0; i < 5; ++i); - // 
TODOexpected-error@+1{{OpenACC 'vector_length' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'vector_length' not yet implemented}} #pragma acc parallel loop auto vector_length(1) for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_gangs' not yet implemented}} #pragma acc parallel loop auto num_gangs(1) for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_workers' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_workers' not yet implemented}} #pragma acc parallel loop auto num_workers(1) for(unsigned i = 0; i < 5; ++i); // expected-warning@+1{{OpenACC clause 'device_num' not yet implemented}} @@ -261,16 +255,10 @@ void uses() { // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}} #pragma acc parallel loop bind(Var) auto for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'vector_length' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'vector_length' not yet implemented}} #pragma acc parallel loop vector_length(1) auto for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_gangs' not yet implemented}} #pragma acc parallel loop num_gangs(1) auto for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_workers' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_workers' not yet implemented}} #pragma acc parallel loop num_workers(1) auto for(unsigned i = 0; i < 5; ++i); // expected-warning@+1{{OpenACC clause 'device_num' not yet implemented}} @@ -389,16 +377,10 @@ void uses() { // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}} #pragma acc parallel loop independent bind(Var) for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'vector_length' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'vector_length' not yet implemented}} #pragma acc parallel loop independent vector_length(1) for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_gangs' not yet implemented}} #pragma acc parallel loop independent num_gangs(1) for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_workers' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_workers' not yet implemented}} #pragma acc parallel loop independent num_workers(1) for(unsigned i = 0; i < 5; ++i); // expected-warning@+1{{OpenACC clause 'device_num' not yet implemented}} @@ -516,16 +498,10 @@ void uses() { // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}} #pragma acc parallel loop bind(Var) independent for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'vector_length' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'vector_length' not yet implemented}} #pragma acc parallel loop vector_length(1) independent for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_gangs' not yet implemented}} #pragma acc 
parallel loop num_gangs(1) independent for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_workers' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_workers' not yet implemented}} #pragma acc parallel loop num_workers(1) independent for(unsigned i = 0; i < 5; ++i); // expected-warning@+1{{OpenACC clause 'device_num' not yet implemented}} @@ -650,16 +626,10 @@ void uses() { // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}} #pragma acc parallel loop seq bind(Var) for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'vector_length' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'vector_length' not yet implemented}} #pragma acc parallel loop seq vector_length(1) for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_gangs' not yet implemented}} #pragma acc parallel loop seq num_gangs(1) for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_workers' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_workers' not yet implemented}} #pragma acc parallel loop seq num_workers(1) for(unsigned i = 0; i < 5; ++i); // expected-warning@+1{{OpenACC clause 'device_num' not yet implemented}} @@ -783,16 +753,10 @@ void uses() { // expected-warning@+1{{OpenACC clause 'bind' not yet implemented}} #pragma acc parallel loop bind(Var) seq for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'vector_length' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'vector_length' not yet implemented}} #pragma acc parallel loop vector_length(1) seq for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_gangs' not yet implemented}} #pragma acc parallel loop num_gangs(1) seq for(unsigned i = 0; i < 5; ++i); - // TODOexpected-error@+1{{OpenACC 'num_workers' clause is not valid on 'parallel loop' directive}} - // expected-warning@+1{{OpenACC clause 'num_workers' not yet implemented}} #pragma acc parallel loop num_workers(1) seq for(unsigned i = 0; i < 5; ++i); // expected-warning@+1{{OpenACC clause 'device_num' not yet implemented}} diff --git a/clang/test/SemaOpenACC/combined-construct-device_type-clause.c b/clang/test/SemaOpenACC/combined-construct-device_type-clause.c index a5ab39cb12c38..9a60fb4c665e5 100644 --- a/clang/test/SemaOpenACC/combined-construct-device_type-clause.c +++ b/clang/test/SemaOpenACC/combined-construct-device_type-clause.c @@ -195,7 +195,6 @@ void uses() { // expected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'serial loop' directive}} #pragma acc serial loop device_type(*) num_gangs(1) for(int i = 0; i < 5; ++i); - // expected-warning@+1{{OpenACC clause 'num_workers' not yet implemented, clause ignored}} #pragma acc parallel loop device_type(*) num_workers(1) for(int i = 0; i < 5; ++i); // expected-error@+2{{OpenACC clause 'device_num' may not follow a 'device_type' clause in a 'serial loop' construct}} diff --git a/clang/test/SemaOpenACC/combined-construct-num_gangs-ast.cpp b/clang/test/SemaOpenACC/combined-construct-num_gangs-ast.cpp new file mode 100644 index 0000000000000..6e75a00943364 --- /dev/null +++ b/clang/test/SemaOpenACC/combined-construct-num_gangs-ast.cpp @@ -0,0 
+1,127 @@
+// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s
+
+// Test this with PCH.
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s
+// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s
+
+#ifndef PCH_HELPER
+#define PCH_HELPER
+int some_int();
+short some_short();
+long some_long();
+struct CorrectConvert {
+  operator int();
+} Convert;
+
+
+void NormalUses() {
+  // CHECK: FunctionDecl{{.*}}NormalUses
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel loop num_gangs(some_int(), some_long(), some_short())
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_gangs clause
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()'
+  // CHECK-NEXT: CallExpr{{.*}}'short'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'short (*)()'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short ()' lvalue Function{{.*}} 'some_short' 'short ()'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc kernels loop num_gangs(some_int())
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: num_gangs clause
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+}
+
+template <typename T, typename U>
+void TemplUses(T t, U u) {
+  // CHECK-NEXT: FunctionTemplateDecl
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 0 T
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 1 U
+  // CHECK-NEXT: FunctionDecl{{.*}} TemplUses 'void (T, U)'
+  // CHECK-NEXT: ParmVarDecl{{.*}} t 'T'
+  // CHECK-NEXT: ParmVarDecl{{.*}} u 'U'
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc kernels loop num_gangs(u)
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: num_gangs clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc parallel loop num_gangs(u, U::value)
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_gangs clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>' lvalue
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+  // Check the instantiated versions of the above.
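+  // The instantiation below uses T = CorrectConvert and U = HasInt (both
+  // defined near the end of this file), so the dependent clause arguments
+  // behave roughly as if one had written (illustrative sketch only):
+  //   HasInt u;
+  //   char c = u;              // selects HasInt::operator char()
+  //   int v = HasInt::value;   // static constexpr member, lvalue-to-rvalue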
+  // CHECK-NEXT: FunctionDecl{{.*}} used TemplUses 'void (CorrectConvert, HasInt)' implicit_instantiation
+  // CHECK-NEXT: TemplateArgument type 'CorrectConvert'
+  // CHECK-NEXT: RecordType{{.*}} 'CorrectConvert'
+  // CHECK-NEXT: CXXRecord{{.*}} 'CorrectConvert'
+  // CHECK-NEXT: TemplateArgument type 'HasInt'
+  // CHECK-NEXT: RecordType{{.*}} 'HasInt'
+  // CHECK-NEXT: CXXRecord{{.*}} 'HasInt'
+  // CHECK-NEXT: ParmVarDecl{{.*}} t 'CorrectConvert'
+  // CHECK-NEXT: ParmVarDecl{{.*}} u 'HasInt'
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: num_gangs clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_gangs clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'const int' lvalue Var{{.*}} 'value' 'const int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+}
+
+struct HasInt {
+  using IntTy = int;
+  using ShortTy = short;
+  static constexpr int value = 1;
+
+  operator char();
+};
+
+void Inst() {
+  TemplUses<CorrectConvert, HasInt>({}, {});
+}
+#endif // PCH_HELPER
diff --git a/clang/test/SemaOpenACC/combined-construct-num_gangs-clause.c b/clang/test/SemaOpenACC/combined-construct-num_gangs-clause.c
new file mode 100644
index 0000000000000..bd035bd4a51a2
--- /dev/null
+++ b/clang/test/SemaOpenACC/combined-construct-num_gangs-clause.c
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+short getS();
+float getF();
+void Test() {
+#pragma acc kernels loop num_gangs(1)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'serial loop' directive}}
+#pragma acc serial loop num_gangs(1)
+  for(int i = 5; i < 10;++i);
+
+#pragma acc parallel loop num_gangs(1)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{OpenACC clause 'num_gangs' requires expression of integer type}}
+#pragma acc parallel loop num_gangs(getF())
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{expected expression}}
+#pragma acc kernels loop num_gangs()
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{expected expression}}
+#pragma acc parallel loop num_gangs()
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+2{{OpenACC 'num_gangs' clause cannot appear more than once on a 'kernels loop' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc kernels loop num_gangs(1) num_gangs(2)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+2{{OpenACC 'num_gangs' clause cannot appear more than once on a 'parallel loop' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc parallel loop num_gangs(1) num_gangs(2)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{too many integer expression arguments provided to OpenACC 'num_gangs' clause: 'kernels loop' directive expects maximum of 1, 2 were provided}}
+#pragma acc kernels loop num_gangs(1, getS())
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{too many integer expression arguments provided to OpenACC 'num_gangs' clause: 'parallel loop' directive expects maximum of 3, 4 were provided}}
+#pragma acc parallel loop num_gangs(getS(), 1, getS(), 1)
+  for(int i = 5; i < 10;++i);
+}
diff --git a/clang/test/SemaOpenACC/combined-construct-num_workers-ast.cpp b/clang/test/SemaOpenACC/combined-construct-num_workers-ast.cpp
new file mode 100644
index 0000000000000..8aa361c7b037c
--- /dev/null
+++ b/clang/test/SemaOpenACC/combined-construct-num_workers-ast.cpp
@@ -0,0 +1,235 @@
+// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s
+
+// Test this with PCH.
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s
+// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s
+
+#ifndef PCH_HELPER
+#define PCH_HELPER
+int some_int();
+short some_short();
+long some_long();
+enum E{};
+E some_enum();
+struct CorrectConvert {
+  operator int();
+} Convert;
+
+
+void NormalUses() {
+  // CHECK: FunctionDecl{{.*}}NormalUses
+  // CHECK-NEXT: CompoundStmt
+#pragma acc parallel loop num_workers(some_int())
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int ()' lvalue Function{{.*}} 'some_int' 'int ()'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc kernels loop num_workers(some_short())
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: CallExpr{{.*}}'short'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'short (*)()'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short ()' lvalue Function{{.*}} 'some_short' 'short ()'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc parallel loop num_workers(some_long())
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'long (*)()'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'long ()' lvalue Function{{.*}} 'some_long' 'long ()'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc parallel loop num_workers(some_enum())
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: CallExpr{{.*}}'E'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'E (*)()'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'E ()' lvalue Function{{.*}} 'some_enum' 'E ()'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc kernels loop num_workers(Convert)
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'int'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator int
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'struct CorrectConvert':'CorrectConvert' lvalue Var
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+}
+
+template <typename T, typename U>
+void TemplUses(T t, U u) {
+  // CHECK-NEXT: FunctionTemplateDecl
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 0 T
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 1 U
+  // CHECK-NEXT: FunctionDecl{{.*}} TemplUses 'void (T, U)'
+  // CHECK-NEXT: ParmVarDecl{{.*}} referenced t 'T'
+  // CHECK-NEXT: ParmVarDecl{{.*}} referenced u 'U'
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel loop num_workers(t)
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc kernels loop num_workers(u)
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc parallel loop num_workers(U::value)
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>' lvalue
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc kernels loop num_workers(T{})
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: CXXUnresolvedConstructExpr{{.*}} 'T' 'T' list
+  // CHECK-NEXT: InitListExpr{{.*}} 'void'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc parallel loop num_workers(U{})
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: CXXUnresolvedConstructExpr{{.*}} 'U' 'U' list
+  // CHECK-NEXT: InitListExpr{{.*}} 'void'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc kernels loop num_workers(typename U::IntTy{})
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: CXXUnresolvedConstructExpr{{.*}} 'typename U::IntTy' 'typename U::IntTy' list
+  // CHECK-NEXT: InitListExpr{{.*}} 'void'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc parallel loop num_workers(typename U::ShortTy{})
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: CXXUnresolvedConstructExpr{{.*}} 'typename U::ShortTy' 'typename U::ShortTy' list
+  // CHECK-NEXT: InitListExpr{{.*}} 'void'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+  // Check the instantiated versions of the above.
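+  // At instantiation (T = CorrectConvert, U = HasInt) the dependent forms
+  // above resolve roughly as if one had written (illustrative sketch only):
+  //   int a = CorrectConvert{};   // functional cast, then operator int()
+  //   char b = HasInt{};          // functional cast, then operator char()
+  //   int z = HasInt::IntTy{};    // value-initialized int, i.e. zero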
+  // CHECK-NEXT: FunctionDecl{{.*}} used TemplUses 'void (CorrectConvert, HasInt)' implicit_instantiation
+  // CHECK-NEXT: TemplateArgument type 'CorrectConvert'
+  // CHECK-NEXT: RecordType{{.*}} 'CorrectConvert'
+  // CHECK-NEXT: CXXRecord{{.*}} 'CorrectConvert'
+  // CHECK-NEXT: TemplateArgument type 'HasInt'
+  // CHECK-NEXT: RecordType{{.*}} 'HasInt'
+  // CHECK-NEXT: CXXRecord{{.*}} 'HasInt'
+  // CHECK-NEXT: ParmVarDecl{{.*}} used t 'CorrectConvert'
+  // CHECK-NEXT: ParmVarDecl{{.*}} used u 'HasInt'
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'int'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator int
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'CorrectConvert' lvalue ParmVar
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'const int' lvalue Var{{.*}} 'value' 'const int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'int'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator int
+  // CHECK-NEXT: MaterializeTemporaryExpr{{.*}} 'CorrectConvert' lvalue
+  // CHECK-NEXT: CXXFunctionalCastExpr{{.*}} 'CorrectConvert' functional cast to struct CorrectConvert
+  // CHECK-NEXT: InitListExpr{{.*}}'CorrectConvert'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: MaterializeTemporaryExpr{{.*}} 'HasInt' lvalue
+  // CHECK-NEXT: CXXFunctionalCastExpr{{.*}} 'HasInt' functional cast to struct HasInt
+  // CHECK-NEXT: InitListExpr{{.*}}'HasInt'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: CXXFunctionalCastExpr{{.*}} 'typename HasInt::IntTy':'int' functional cast to typename struct HasInt::IntTy
+  // CHECK-NEXT: InitListExpr{{.*}}'typename HasInt::IntTy':'int'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: num_workers clause
+  // CHECK-NEXT: CXXFunctionalCastExpr{{.*}} 'typename HasInt::ShortTy':'short' functional cast to typename struct HasInt::ShortTy
+  // CHECK-NEXT: InitListExpr{{.*}}'typename HasInt::ShortTy':'short'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+}
+struct HasInt {
+  using IntTy = int;
+  using ShortTy = short;
+  static constexpr int value = 1;
+
+  operator char();
+};
+
+void Inst() {
+  TemplUses<CorrectConvert, HasInt>({}, {});
+}
+#endif // PCH_HELPER
diff --git a/clang/test/SemaOpenACC/combined-construct-num_workers-clause.c b/clang/test/SemaOpenACC/combined-construct-num_workers-clause.c
new file mode 100644
index 0000000000000..a5891f071bb03
--- /dev/null
+++ b/clang/test/SemaOpenACC/combined-construct-num_workers-clause.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+short getS();
+float getF();
+void Test() {
+#pragma acc kernels loop num_workers(1)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{OpenACC 'num_workers' clause is not valid on 'serial loop' directive}}
+#pragma acc serial loop num_workers(1)
+  for(int i = 5; i < 10;++i);
+
+#pragma acc parallel loop num_workers(1)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{OpenACC clause 'num_workers' requires expression of integer type}}
+#pragma acc parallel loop num_workers(getF())
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{expected expression}}
+#pragma acc kernels loop num_workers()
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{expected expression}}
+#pragma acc parallel loop num_workers()
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+2{{expected ')'}}
+  // expected-note@+1{{to match this '('}}
+#pragma acc kernels loop num_workers(1, 2)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+2{{expected ')'}}
+  // expected-note@+1{{to match this '('}}
+#pragma acc parallel loop num_workers(1, 2)
+  for(int i = 5; i < 10;++i);
+}
diff --git a/clang/test/SemaOpenACC/combined-construct-vector_length-ast.cpp b/clang/test/SemaOpenACC/combined-construct-vector_length-ast.cpp
new file mode 100644
index 0000000000000..6cfc9c6b8b2c2
--- /dev/null
+++ b/clang/test/SemaOpenACC/combined-construct-vector_length-ast.cpp
@@ -0,0 +1,98 @@
+// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s
+
+// Test this with PCH.
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s
+// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s
+
+#ifndef PCH_HELPER
+#define PCH_HELPER
+short some_short();
+
+struct CorrectConvert {
+  operator int();
+} Convert;
+
+
+void NormalUses() {
+  // CHECK: FunctionDecl{{.*}}NormalUses
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc kernels loop vector_length(some_short())
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: vector_length clause
+  // CHECK-NEXT: CallExpr{{.*}}'short'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'short (*)()'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short ()' lvalue Function{{.*}} 'some_short' 'short ()'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+}
+template <typename T, typename U>
+void TemplUses(T t, U u) {
+  // CHECK-NEXT: FunctionTemplateDecl
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 0 T
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 1 U
+  // CHECK-NEXT: FunctionDecl{{.*}} TemplUses 'void (T, U)'
+  // CHECK-NEXT: ParmVarDecl{{.*}} t 'T'
+  // CHECK-NEXT: ParmVarDecl{{.*}} u 'U'
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc kernels loop vector_length(u)
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: vector_length clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'U' lvalue ParmVar{{.*}} 'u' 'U'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+#pragma acc parallel loop vector_length(U::value)
+  for (unsigned i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: vector_length clause
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>' lvalue
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'U'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+  // Check the instantiated versions of the above.
+  // CHECK-NEXT: FunctionDecl{{.*}} used TemplUses 'void (CorrectConvert, HasInt)' implicit_instantiation
+  // CHECK-NEXT: TemplateArgument type 'CorrectConvert'
+  // CHECK-NEXT: RecordType{{.*}} 'CorrectConvert'
+  // CHECK-NEXT: CXXRecord{{.*}} 'CorrectConvert'
+  // CHECK-NEXT: TemplateArgument type 'HasInt'
+  // CHECK-NEXT: RecordType{{.*}} 'HasInt'
+  // CHECK-NEXT: CXXRecord{{.*}} 'HasInt'
+  // CHECK-NEXT: ParmVarDecl{{.*}} t 'CorrectConvert'
+  // CHECK-NEXT: ParmVarDecl{{.*}} u 'HasInt'
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} kernels loop
+  // CHECK-NEXT: vector_length clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'char'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'char'
+  // CHECK-NEXT: MemberExpr{{.*}} '<bound member function type>' .operator char
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'HasInt' lvalue ParmVar
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: vector_length clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'const int' lvalue Var{{.*}} 'value' 'const int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'HasInt'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+}
+
+struct HasInt {
+  using IntTy = int;
+  using ShortTy = short;
+  static constexpr int value = 1;
+
+  operator char();
+};
+
+void Inst() {
+  TemplUses<CorrectConvert, HasInt>({}, {});
+}
+#endif // PCH_HELPER
diff --git a/clang/test/SemaOpenACC/combined-construct-vector_length-clause.c b/clang/test/SemaOpenACC/combined-construct-vector_length-clause.c
new file mode 100644
index 0000000000000..8b6dedd9b83ba
--- /dev/null
+++ b/clang/test/SemaOpenACC/combined-construct-vector_length-clause.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+short getS();
+float getF();
+void Test() {
+#pragma acc kernels loop vector_length(1)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{OpenACC 'vector_length' clause is not valid on 'serial loop' directive}}
+#pragma acc serial loop vector_length(1)
+  for(int i = 5; i < 10;++i);
+
+#pragma acc parallel loop vector_length(1)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{OpenACC clause 'vector_length' requires expression of integer type}}
+#pragma acc parallel loop vector_length(getF())
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{expected expression}}
+#pragma acc kernels loop vector_length()
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{expected expression}}
+#pragma acc parallel loop vector_length()
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+2{{expected ')'}}
+  // expected-note@+1{{to match this '('}}
+#pragma acc kernels loop vector_length(1, 2)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+2{{expected ')'}}
+  // expected-note@+1{{to match this '('}}
+#pragma acc parallel loop vector_length(1, 2)
+  for(int i = 5; i < 10;++i);
+}
diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp
index 5481bb6b87503..28610052b9b74 100644
--- a/clang/tools/clang-format/ClangFormat.cpp
+++ b/clang/tools/clang-format/ClangFormat.cpp
@@ -87,7 +87,7 @@ static cl::opt<std::string> AssumeFileName(
     "supported:\n"
     "  CSharp: .cs\n"
     "  Java: .java\n"
-    "  JavaScript: .mjs .js .ts\n"
+    "  JavaScript: .js .mjs .cjs .ts\n"
     "  Json: .json\n"
     "  Objective-C: .m .mm\n"
     "  Proto: .proto .protodevel\n"
diff --git a/clang/tools/clang-format/git-clang-format b/clang/tools/clang-format/git-clang-format
index 6a2a2a22ec5c2..a322d4abf0ec2 100755
--- a/clang/tools/clang-format/git-clang-format
+++
b/clang/tools/clang-format/git-clang-format @@ -94,7 +94,7 @@ def main(): # Other languages that clang-format supports 'proto', 'protodevel', # Protocol Buffers 'java', # Java - 'mjs', 'js', # JavaScript + 'js', 'mjs', 'cjs', # JavaScript 'ts', # TypeScript 'cs', # C Sharp 'json', # Json diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt index 7aebbe7a19436..f9a911b0ae8e2 100644 --- a/clang/tools/clang-repl/CMakeLists.txt +++ b/clang/tools/clang-repl/CMakeLists.txt @@ -66,7 +66,7 @@ clang_target_link_libraries(clang-repl PRIVATE # start to exceed this limit, e.g. when linking for arm-linux-gnueabihf with # gold. This flag tells the linker to build a PLT for the full address range. # Linkers without this flag are assumed to support proper PLTs by default. -set(flag_long_plt "-Wl,--long-plt") +set(flag_long_plt "LINKER:--long-plt") check_linker_flag(CXX ${flag_long_plt} HAVE_LINKER_FLAG_LONG_PLT) if(HAVE_LINKER_FLAG_LONG_PLT) target_link_options(clang-repl PRIVATE ${flag_long_plt}) diff --git a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp index 1d18869a6b8af..75d6ca5ba17f8 100644 --- a/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp +++ b/clang/unittests/ASTMatchers/ASTMatchersTraversalTest.cpp @@ -283,6 +283,12 @@ TEST(HasDeclaration, HasDeclarationOfTypeAlias) { hasDeclaration(typeAliasTemplateDecl())))))))); } +TEST(HasDeclaration, HasDeclarationOfObjCInterface) { + EXPECT_TRUE(matchesObjC("@interface BaseClass @end void f() {BaseClass* b;}", + varDecl(hasType(objcObjectPointerType( + pointee(hasDeclaration(objcInterfaceDecl()))))))); +} + TEST(HasUnqualifiedDesugaredType, DesugarsUsing) { EXPECT_TRUE( matches("struct A {}; using B = A; B b;", diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 186f7cc0ace54..cdedbcbaa4072 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -3599,7 +3599,7 @@

     <td>591</td>
     <td>CD4</td>
     <td>When a dependent base class is the current instantiation</td>
-    <td class="none" align="center">No</td>
+    <td class="unreleased" align="center">Clang 20</td>
   </tr>
   <tr id="592">
     <td>592</td>
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 20054c6e85a40..80d5aaabfd8c3 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -768,7 +768,7 @@ if (CMAKE_LINKER MATCHES "link.exe$")
   # it, but CMake doesn't seem to have a way to set linker flags for
   # individual static libraries, so we enable the suppression flag for
   # the whole compiler-rt project.
-  set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /IGNORE:4221")
+  set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${CMAKE_CXX_LINKER_WRAPPER_FLAG}/IGNORE:4221")
 endif()
 
 add_subdirectory(include)
diff --git a/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake
index f3c8fbe2c2fec..74a5d4edcd859 100644
--- a/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake
+++ b/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake
@@ -136,14 +136,13 @@ function(darwin_test_archs os valid_archs)
 
   # The simple program will build for x86_64h on the simulator because it is
   # compatible with x86_64 libraries (mostly), but since x86_64h isn't actually
-  # a valid or useful architecture for the iOS simulator we should drop it.
+  # a valid or useful architecture for the simulators, we should drop it.
   if(${os} MATCHES "^(iossim|tvossim|watchossim)$")
     list(REMOVE_ITEM archs "x86_64h")
-  endif()
-
-  if(${os} MATCHES "iossim")
-    message(STATUS "Disabling i386 slice for iossim")
-    list(REMOVE_ITEM archs "i386")
+    if ("i386" IN_LIST archs)
+      list(REMOVE_ITEM archs "i386")
+      message(STATUS "Disabling i386 slice for simulator")
+    endif()
   endif()
 
   if(${os} MATCHES "^ios$")
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index c66b0465a0b54..39613da81ecb4 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -730,10 +730,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_COVMAP_VERSION 6
 
 /* Profile version is always of type uint64_t. Reserve the upper 32 bits in the
- * version for other variants of profile. We set the 8th most significant bit
+ * version for other variants of profile. We set the 8th most significant bit
  * (i.e. bit 56) to 1 to indicate if this is an IR-level instrumentation
 * generated profile, and 0 if this is a Clang FE generated profile.
 * 1 in bit 57 indicates there are context-sensitive records in the profile.
+ * The 55th bit indicates whether to always instrument loop entry blocks.
+ * The 58th bit indicates whether to always instrument function entry blocks.
 * The 59th bit indicates whether to use debug info to correlate profiles.
 * The 60th bit indicates single byte coverage instrumentation.
 * The 61st bit indicates function entry instrumentation only.
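/* A minimal sketch of how a consumer decodes these variant bits with the
 * macros defined in the hunk below; `RawVersion` stands in for the version
 * field read from a raw profile header (an assumed name, for illustration):
 *
 *   uint64_t FormatVersion = GET_VERSION(RawVersion);
 *   int HasLoopEntryInstr =
 *       (RawVersion & VARIANT_MASK_INSTR_LOOP_ENTRIES) != 0;   // bit 55
 *   int IsIRLevelProfile = (RawVersion & VARIANT_MASK_IR_PROF) != 0; // bit 56
 */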
@@ -742,6 +744,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
  */
 #define VARIANT_MASKS_ALL 0xffffffff00000000ULL
 #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL)
+#define VARIANT_MASK_INSTR_LOOP_ENTRIES (0x1ULL << 55)
 #define VARIANT_MASK_IR_PROF (0x1ULL << 56)
 #define VARIANT_MASK_CSIR_PROF (0x1ULL << 57)
 #define VARIANT_MASK_INSTR_ENTRY (0x1ULL << 58)
diff --git a/compiler-rt/lib/asan/CMakeLists.txt b/compiler-rt/lib/asan/CMakeLists.txt
index fb3d74283a61e..5ec995ae159b7 100644
--- a/compiler-rt/lib/asan/CMakeLists.txt
+++ b/compiler-rt/lib/asan/CMakeLists.txt
@@ -141,7 +141,7 @@ append_list_if(COMPILER_RT_HAS_FTLS_MODEL_INITIAL_EXEC
 # LLVM turns /OPT:ICF back on when LLVM_ENABLE_PDBs is set
 # we _REALLY_ need to turn it back off for ASAN, because the way
 # asan emulates weak functions from DLLs requires NOICF
-append_list_if(MSVC "/DEBUG;/OPT:NOICF" ASAN_DYNAMIC_LINK_FLAGS)
+append_list_if(MSVC "LINKER:/DEBUG;LINKER:/OPT:NOICF" ASAN_DYNAMIC_LINK_FLAGS)
 
 set(ASAN_DYNAMIC_LIBS
   ${COMPILER_RT_UNWINDER_LINK_LIBS}
diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c
index f9b95d2faf06a..606571d527501 100644
--- a/compiler-rt/lib/builtins/cpu_model/x86.c
+++ b/compiler-rt/lib/builtins/cpu_model/x86.c
@@ -485,7 +485,7 @@ static const char *getIntelProcessorTypeAndSubtype(unsigned Family,
 
   // Gracemont:
   case 0xbe:
-    CPU = "gracement";
+    CPU = "gracemont";
     *Type = INTEL_COREI7;
     *Subtype = INTEL_COREI7_ALDERLAKE;
     break;
diff --git a/flang/include/flang/Lower/ConvertCall.h b/flang/include/flang/Lower/ConvertCall.h
index bc082907e6176..f1cd4f938320b 100644
--- a/flang/include/flang/Lower/ConvertCall.h
+++ b/flang/include/flang/Lower/ConvertCall.h
@@ -24,6 +24,11 @@ namespace Fortran::lower {
 
+/// Data structure packaging the SSA value(s) produced for the result of lowered
+/// function calls.
+using LoweredResult =
+    std::variant<fir::ExtendedValue, hlfir::EntityWithAttributes>;
+
 /// Given a call site for which the arguments were already lowered, generate
 /// the call and return the result. This function deals with explicit result
 /// allocation and lowering if needed. It also deals with passing the host
@@ -32,7 +37,7 @@ namespace Fortran::lower {
 /// It is only used for HLFIR.
 /// The returned boolean indicates if finalization has been emitted in
 /// \p stmtCtx for the result.
-std::pair<fir::ExtendedValue, bool> genCallOpAndResult(
+std::pair<LoweredResult, bool> genCallOpAndResult(
     mlir::Location loc, Fortran::lower::AbstractConverter &converter,
     Fortran::lower::SymMap &symMap, Fortran::lower::StatementContext &stmtCtx,
     Fortran::lower::CallerInterface &caller, mlir::FunctionType callSiteType,
diff --git a/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h b/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h
index e410831c0fc3e..8d17e4e476d10 100644
--- a/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h
+++ b/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h
@@ -198,6 +198,12 @@ struct AliasAnalysis {
   /// Return the modify-reference behavior of `op` on `location`.
   mlir::ModRefResult getModRef(mlir::Operation *op, mlir::Value location);
 
+  /// Return the modify-reference behavior of operations inside `region` on
+  /// `location`. Contrary to getModRef(operation, location), this will visit
+  /// nested regions recursively according to the HasRecursiveMemoryEffects
+  /// trait.
+  mlir::ModRefResult getModRef(mlir::Region &region, mlir::Value location);
+
   /// Return the memory source of a value.
  /// If getLastInstantiationPoint is true, the search for the source
  /// will stop at [hl]fir.declare if it represents a dummy
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h b/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
index 3830237f96f3c..447d5fbab8999 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
@@ -61,6 +61,10 @@ inline mlir::Type getFortranElementOrSequenceType(mlir::Type type) {
   return type;
 }
 
+/// Build the hlfir.expr type for the value held in a variable of type \p
+/// variableType.
+mlir::Type getExprType(mlir::Type variableType);
+
 /// Is this a fir.box or fir.class address type?
 inline bool isBoxAddressType(mlir::Type type) {
   type = fir::dyn_cast_ptrEleTy(type);
diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index 55fafc2e6b36f..339182605f818 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -20,6 +20,7 @@
 #include "flang/Tools/CrossToolHelpers.h"
 #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index e84e7afbe82e0..40cd106e63018 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -284,7 +284,8 @@ static void remapActualToDummyDescriptors(
   }
 }
 
-std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
+std::pair<Fortran::lower::LoweredResult, bool>
+Fortran::lower::genCallOpAndResult(
     mlir::Location loc, Fortran::lower::AbstractConverter &converter,
     Fortran::lower::SymMap &symMap, Fortran::lower::StatementContext &stmtCtx,
    Fortran::lower::CallerInterface &caller, mlir::FunctionType callSiteType,
@@ -326,6 +327,11 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
     }
   }
 
+  const bool isExprCall =
+      converter.getLoweringOptions().getLowerToHighLevelFIR() &&
+      callSiteType.getNumResults() == 1 &&
+      llvm::isa<fir::SequenceType>(callSiteType.getResult(0));
+
   mlir::IndexType idxTy = builder.getIndexType();
   auto lowerSpecExpr = [&](const auto &expr) -> mlir::Value {
     mlir::Value convertExpr = builder.createConvert(
@@ -333,6 +339,8 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
     return fir::factory::genMaxWithZero(builder, loc, convertExpr);
   };
   llvm::SmallVector<mlir::Value> resultLengths;
+  mlir::Value arrayResultShape;
+  hlfir::EvaluateInMemoryOp evaluateInMemory;
   auto allocatedResult = [&]() -> std::optional<fir::ExtendedValue> {
     llvm::SmallVector<mlir::Value> extents;
     llvm::SmallVector<mlir::Value> lengths;
@@ -366,6 +374,18 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
       resultLengths = lengths;
     }
 
+    if (!extents.empty())
+      arrayResultShape = builder.genShape(loc, extents);
+
+    if (isExprCall) {
+      mlir::Type exprType = hlfir::getExprType(type);
+      evaluateInMemory = builder.create<hlfir::EvaluateInMemoryOp>(
+          loc, exprType, arrayResultShape, resultLengths);
+      builder.setInsertionPointToStart(&evaluateInMemory.getBody().front());
+      return toExtendedValue(loc, evaluateInMemory.getMemory(), extents,
+                             lengths);
+    }
+
     if ((!extents.empty() || !lengths.empty()) && !isElemental) {
       // Note: in the elemental context, the alloca ownership inside the
       // elemental region is implicit, and later pass in lowering (stack
@@ -384,8 +404,7 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
   if (mustPopSymMap)
     symMap.popScope();
 
-  // Place allocated result or prepare the fir.save_result arguments.
allocated result or prepare the fir.save_result arguments. - mlir::Value arrayResultShape; + // Place allocated result if (allocatedResult) { if (std::optional::PassedEntity> @@ -399,16 +418,6 @@ std::pair Fortran::lower::genCallOpAndResult( else fir::emitFatalError( loc, "only expect character scalar result to be passed by ref"); - } else { - assert(caller.mustSaveResult()); - arrayResultShape = allocatedResult->match( - [&](const fir::CharArrayBoxValue &) { - return builder.createShape(loc, *allocatedResult); - }, - [&](const fir::ArrayBoxValue &) { - return builder.createShape(loc, *allocatedResult); - }, - [&](const auto &) { return mlir::Value{}; }); } } @@ -642,6 +651,19 @@ std::pair Fortran::lower::genCallOpAndResult( callResult = call.getResult(0); } + std::optional retTy = + caller.getCallDescription().proc().GetType(); + // With HLFIR lowering, isElemental must be set to true + // if we are producing an elemental call. In this case, + // the elemental results must not be destroyed, instead, + // the resulting array result will be finalized/destroyed + // as needed by hlfir.destroy. + const bool mustFinalizeResult = + !isElemental && callSiteType.getNumResults() > 0 && + !fir::isPointerType(callSiteType.getResult(0)) && retTy.has_value() && + (retTy->category() == Fortran::common::TypeCategory::Derived || + retTy->IsPolymorphic() || retTy->IsUnlimitedPolymorphic()); + if (caller.mustSaveResult()) { assert(allocatedResult.has_value()); builder.create(loc, callResult, @@ -649,6 +671,19 @@ std::pair Fortran::lower::genCallOpAndResult( arrayResultShape, resultLengths); } + if (evaluateInMemory) { + builder.setInsertionPointAfter(evaluateInMemory); + mlir::Value expr = evaluateInMemory.getResult(); + fir::FirOpBuilder *bldr = &converter.getFirOpBuilder(); + if (!isElemental) + stmtCtx.attachCleanup([bldr, loc, expr, mustFinalizeResult]() { + bldr->create(loc, expr, + /*finalize=*/mustFinalizeResult); + }); + return {LoweredResult{hlfir::EntityWithAttributes{expr}}, + mustFinalizeResult}; + } + if (allocatedResult) { // The result must be optionally destroyed (if it is of a derived type // that may need finalization or deallocation of the components). @@ -679,17 +714,7 @@ std::pair Fortran::lower::genCallOpAndResult( // derived-type. // For polymorphic and unlimited polymorphic enities call the runtime // in any cases. - std::optional retTy = - caller.getCallDescription().proc().GetType(); - // With HLFIR lowering, isElemental must be set to true - // if we are producing an elemental call. In this case, - // the elemental results must not be destroyed, instead, - // the resulting array result will be finalized/destroyed - // as needed by hlfir.destroy. 
- if (!isElemental && !fir::isPointerType(funcType.getResults()[0]) && - retTy && - (retTy->category() == Fortran::common::TypeCategory::Derived || - retTy->IsPolymorphic() || retTy->IsUnlimitedPolymorphic())) { + if (mustFinalizeResult) { if (retTy->IsPolymorphic() || retTy->IsUnlimitedPolymorphic()) { auto *bldr = &converter.getFirOpBuilder(); stmtCtx.attachCleanup([bldr, loc, allocatedResult]() { @@ -715,12 +740,13 @@ std::pair Fortran::lower::genCallOpAndResult( } } } - return {*allocatedResult, resultIsFinalized}; + return {LoweredResult{*allocatedResult}, resultIsFinalized}; } // subroutine call if (!resultType) - return {fir::ExtendedValue{mlir::Value{}}, /*resultIsFinalized=*/false}; + return {LoweredResult{fir::ExtendedValue{mlir::Value{}}}, + /*resultIsFinalized=*/false}; // For now, Fortran return values are implemented with a single MLIR // function return value. @@ -734,10 +760,13 @@ std::pair Fortran::lower::genCallOpAndResult( mlir::dyn_cast(funcType.getResults()[0]); mlir::Value len = builder.createIntegerConstant( loc, builder.getCharacterLengthType(), charTy.getLen()); - return {fir::CharBoxValue{callResult, len}, /*resultIsFinalized=*/false}; + return { + LoweredResult{fir::ExtendedValue{fir::CharBoxValue{callResult, len}}}, + /*resultIsFinalized=*/false}; } - return {callResult, /*resultIsFinalized=*/false}; + return {LoweredResult{fir::ExtendedValue{callResult}}, + /*resultIsFinalized=*/false}; } static hlfir::EntityWithAttributes genStmtFunctionRef( @@ -1661,19 +1690,25 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals, // Prepare lowered arguments according to the interface // and map the lowered values to the dummy // arguments. - auto [result, resultIsFinalized] = Fortran::lower::genCallOpAndResult( + auto [loweredResult, resultIsFinalized] = Fortran::lower::genCallOpAndResult( loc, callContext.converter, callContext.symMap, callContext.stmtCtx, caller, callSiteType, callContext.resultType, callContext.isElementalProcWithArrayArgs()); - // For procedure pointer function result, just return the call. - if (callContext.resultType && - mlir::isa(*callContext.resultType)) - return hlfir::EntityWithAttributes(fir::getBase(result)); /// Clean-up associations and copy-in. for (auto cleanUp : callCleanUps) cleanUp.genCleanUp(loc, builder); + if (auto *entity = std::get_if(&loweredResult)) + return *entity; + + auto &result = std::get(loweredResult); + + // For procedure pointer function result, just return the call. + if (callContext.resultType && + mlir::isa(*callContext.resultType)) + return hlfir::EntityWithAttributes(fir::getBase(result)); + if (!fir::getBase(result)) return std::nullopt; // subroutine call. diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index 46168b81dd3a0..7698fac89c223 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -2852,10 +2852,11 @@ class ScalarExprLowering { } } - ExtValue result = + auto loweredResult = Fortran::lower::genCallOpAndResult(loc, converter, symMap, stmtCtx, caller, callSiteType, resultType) .first; + auto &result = std::get(loweredResult); // Sync pointers and allocatables that may have been modified during the // call. 
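With the ConvertCall.cpp changes above, genCallOpAndResult now returns a std::pair<LoweredResult, bool>, and each call site discriminates the variant: the HLFIR caller takes the hlfir::EntityWithAttributes alternative produced via hlfir.eval_in_mem, while the legacy lowering paths still unpack a fir::ExtendedValue. The following is a minimal sketch of that dispatch pattern; the FIR/HLFIR types are stubbed out as empty structs, and only the std::get_if/std::get shape mirrors the patch:

```cpp
// Sketch only: ExtendedValue and EntityWithAttributes stand in for the real
// fir::ExtendedValue and hlfir::EntityWithAttributes types.
#include <variant>

struct ExtendedValue {};        // stand-in for fir::ExtendedValue
struct EntityWithAttributes {}; // stand-in for hlfir::EntityWithAttributes

using LoweredResult = std::variant<ExtendedValue, EntityWithAttributes>;

EntityWithAttributes processCallResult(const LoweredResult &loweredResult) {
  // HLFIR path: the call already produced an expression entity; return it.
  if (auto *entity = std::get_if<EntityWithAttributes>(&loweredResult))
    return *entity;
  // Legacy path: std::get asserts the ExtendedValue alternative is held.
  const ExtendedValue &result = std::get<ExtendedValue>(loweredResult);
  (void)result; // ... existing fir::ExtendedValue handling goes here ...
  return EntityWithAttributes{};
}
```

Because std::get_if takes a pointer to the variant and returns nullptr on a mismatch, the HLFIR probe costs a single branch, while std::get on the fallback path fails loudly (std::bad_variant_access) if a new alternative is ever added without updating the caller.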
@@ -4881,10 +4882,12 @@ class ArrayExprLowering { [&](const auto &) { return fir::getBase(exv); }); caller.placeInput(argIface, arg); } - return Fortran::lower::genCallOpAndResult(loc, converter, symMap, - getElementCtx(), caller, - callSiteType, retTy) - .first; + Fortran::lower::LoweredResult res = + Fortran::lower::genCallOpAndResult(loc, converter, symMap, + getElementCtx(), caller, + callSiteType, retTy) + .first; + return std::get<fir::ExtendedValue>(res); }; } diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp index e93fbc562f9b1..4ab319b016caf 100644 --- a/flang/lib/Lower/ConvertExprToHLFIR.cpp +++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp @@ -1696,18 +1696,17 @@ class HlfirBuilder { // required chains of hlfir.designate to address the parent components so // that the StructureConstructor can later be lowered by addressing these // parent components if needed. Note: the front-end orders the components in - // structure constructors. The code below relies on the component to appear - // in order. + // structure constructors. using ValueAndParent = std::tuple; llvm::SmallVector<ValueAndParent> valuesAndParents; - Fortran::lower::ComponentReverseIterator compIterator( - ctor.result().derivedTypeSpec()); - hlfir::EntityWithAttributes currentParent = varOp; for (const auto &value : llvm::reverse(ctor.values())) { const Fortran::semantics::Symbol &compSym = *value.first; - while (!compIterator.lookup(compSym.name())) { + hlfir::EntityWithAttributes currentParent = varOp; + for (Fortran::lower::ComponentReverseIterator compIterator( + ctor.result().derivedTypeSpec()); + !compIterator.lookup(compSym.name());) { const auto &parentType = compIterator.advanceToParentType(); llvm::StringRef parentName = toStringRef(parentType.name()); auto baseRecTy = mlir::cast<fir::RecordType>( diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 2b24791d6c7c5..0b0f83d024ce3 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -91,6 +91,13 @@ bool AliasAnalysis::Source::isDummyArgument() const { return false; } +static bool isEvaluateInMemoryBlockArg(mlir::Value v) { + if (auto evalInMem = llvm::dyn_cast_or_null<hlfir::EvaluateInMemoryOp>( + v.getParentRegion()->getParentOp())) + return evalInMem.getMemory() == v; + return false; +} + bool AliasAnalysis::Source::isData() const { return origin.isData; } bool AliasAnalysis::Source::isBoxData() const { return mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(valueType)) && @@ -457,6 +464,33 @@ ModRefResult AliasAnalysis::getModRef(Operation *op, Value location) { return result; } +ModRefResult AliasAnalysis::getModRef(mlir::Region &region, + mlir::Value location) { + ModRefResult result = ModRefResult::getNoModRef(); + for (mlir::Operation &op : region.getOps()) { + if (op.hasTrait<mlir::OpTrait::HasRecursiveMemoryEffects>()) { + for (mlir::Region &subRegion : op.getRegions()) { + result = result.merge(getModRef(subRegion, location)); + // Return early if the result is already Mod and Ref. + if (result.isModAndRef()) + return result; + } + // In MLIR, RecursiveMemoryEffects can be combined with + // MemoryEffectOpInterface to describe extra effects on top of the + // effects of the nested operations. However, the presence of + // RecursiveMemoryEffects and the absence of MemoryEffectOpInterface + // implies the operation has no memory effects other than those of its + // nested operations.
+ if (!mlir::isa<mlir::MemoryEffectOpInterface>(op)) + continue; + } + result = result.merge(getModRef(&op, location)); + if (result.isModAndRef()) + return result; + } + return result; +} + AliasAnalysis::Source::Attributes getAttrsFromVariable(fir::FortranVariableOpInterface var) { AliasAnalysis::Source::Attributes attrs; @@ -698,7 +732,7 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, breakFromLoop = true; }); } - if (!defOp && type == SourceKind::Unknown) + if (!defOp && type == SourceKind::Unknown) { // Check if the memory source is coming through a dummy argument. if (isDummyArgument(v)) { type = SourceKind::Argument; @@ -708,7 +742,12 @@ if (isPointerReference(ty)) attributes.set(Attribute::Pointer); + } else if (isEvaluateInMemoryBlockArg(v)) { + // The hlfir.eval_in_mem block argument is allocated by the operation. + type = SourceKind::Allocate; + ty = v.getType(); } + } if (type == SourceKind::Global) { return {{global, instantiationPoint, followingData}, diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp index 0b61c0edce622..d67b5fa659807 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp @@ -215,3 +215,16 @@ bool hlfir::mayHaveAllocatableComponent(mlir::Type ty) { return fir::isPolymorphicType(ty) || fir::isUnlimitedPolymorphicType(ty) || fir::isRecordWithAllocatableMember(hlfir::getFortranElementType(ty)); } + +mlir::Type hlfir::getExprType(mlir::Type variableType) { + hlfir::ExprType::Shape typeShape; + bool isPolymorphic = fir::isPolymorphicType(variableType); + mlir::Type type = getFortranElementOrSequenceType(variableType); + if (auto seqType = mlir::dyn_cast<fir::SequenceType>(type)) { + assert(!seqType.hasUnknownShape() && "assumed-rank cannot be expressions"); + typeShape.append(seqType.getShape().begin(), seqType.getShape().end()); + type = seqType.getEleTy(); + } + return hlfir::ExprType::get(variableType.getContext(), typeShape, type, + isPolymorphic); +} diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index 8751988244648..3a172d1b8b540 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -1427,16 +1427,7 @@ llvm::LogicalResult hlfir::EndAssociateOp::verify() { void hlfir::AsExprOp::build(mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Value var, mlir::Value mustFree) { - hlfir::ExprType::Shape typeShape; - bool isPolymorphic = fir::isPolymorphicType(var.getType()); - mlir::Type type = getFortranElementOrSequenceType(var.getType()); - if (auto seqType = mlir::dyn_cast<fir::SequenceType>(type)) { - typeShape.append(seqType.getShape().begin(), seqType.getShape().end()); - type = seqType.getEleTy(); - } - - auto resultType = hlfir::ExprType::get(builder.getContext(), typeShape, type, - isPolymorphic); + mlir::Type resultType = hlfir::getExprType(var.getType()); return build(builder, result, resultType, var, mustFree); } diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp index a0160b233e3cd..9327e7ad5875c 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp @@ -1108,6 +1108,100 @@ class ReductionMaskConversion : public mlir::OpRewritePattern<Op> { } }; +class EvaluateIntoMemoryAssignBufferization + : public mlir::OpRewritePattern<hlfir::EvaluateInMemoryOp> { + +public: + using
mlir::OpRewritePattern<hlfir::EvaluateInMemoryOp>::OpRewritePattern; + + llvm::LogicalResult + matchAndRewrite(hlfir::EvaluateInMemoryOp, + mlir::PatternRewriter &rewriter) const override; +}; + +static llvm::LogicalResult +tryUsingAssignLhsDirectly(hlfir::EvaluateInMemoryOp evalInMem, + mlir::PatternRewriter &rewriter) { + mlir::Location loc = evalInMem.getLoc(); + hlfir::DestroyOp destroy; + hlfir::AssignOp assign; + for (auto user : llvm::enumerate(evalInMem->getUsers())) { + if (user.index() > 2) + return mlir::failure(); + mlir::TypeSwitch<mlir::Operation *>(user.value()) + .Case([&](hlfir::AssignOp op) { assign = op; }) + .Case([&](hlfir::DestroyOp op) { destroy = op; }); + } + if (!assign || !destroy || destroy.mustFinalizeExpr() || + assign.isAllocatableAssignment()) + return mlir::failure(); + + hlfir::Entity lhs{assign.getLhs()}; + // EvaluateInMemoryOp memory is contiguous, so in general, it can only be + // replaced by the LHS if the LHS is contiguous. + if (!lhs.isSimplyContiguous()) + return mlir::failure(); + // Character assignment may involve truncation/padding, so the LHS + // cannot be used to evaluate RHS in place without proving the LHS and + // RHS lengths are the same. + if (lhs.isCharacter()) + return mlir::failure(); + fir::AliasAnalysis aliasAnalysis; + // The region must not read or write the LHS. + // Note that getModRef is used instead of mlir::MemoryEffects because + // EvaluateInMemoryOp is typically expected to hold fir.calls and that + // Fortran calls cannot be modeled in a useful way with mlir::MemoryEffects: + // it is hard/impossible to list all the read/written SSA values in a call, + // but it is often possible to tell that an SSA value cannot be accessed, + // hence getModRef is needed here and below. Also note that getModRef uses + // mlir::MemoryEffects for operations that do not have special handling in + // getModRef. + if (aliasAnalysis.getModRef(evalInMem.getBody(), lhs).isModOrRef()) + return mlir::failure(); + // Any variables affected between the hlfir.evalInMem and the assignment must + // not be read or written inside the region since the region will be moved to + // the assignment insertion point. + auto effects = getEffectsBetween(evalInMem->getNextNode(), assign); + if (!effects) { + LLVM_DEBUG( + llvm::dbgs() + << "operation with unknown effects between eval_in_mem and assign\n"); + return mlir::failure(); + } + for (const mlir::MemoryEffects::EffectInstance &effect : *effects) { + mlir::Value affected = effect.getValue(); + if (!affected || + aliasAnalysis.getModRef(evalInMem.getBody(), affected).isModOrRef()) + return mlir::failure(); + } + + rewriter.setInsertionPoint(assign); + fir::FirOpBuilder builder(rewriter, evalInMem.getOperation()); + mlir::Value rawLhs = hlfir::genVariableRawAddress(loc, builder, lhs); + hlfir::computeEvaluateOpIn(loc, builder, evalInMem, rawLhs); + rewriter.eraseOp(assign); + rewriter.eraseOp(destroy); + rewriter.eraseOp(evalInMem); + return mlir::success(); +} + +llvm::LogicalResult EvaluateIntoMemoryAssignBufferization::matchAndRewrite( + hlfir::EvaluateInMemoryOp evalInMem, + mlir::PatternRewriter &rewriter) const { + if (mlir::succeeded(tryUsingAssignLhsDirectly(evalInMem, rewriter))) + return mlir::success(); + // Rewrite to temp + as_expr here so that the assign + as_expr pattern can + // kick in for simple types and at least implement the assignment inline + // instead of calling the Assign runtime.
+ fir::FirOpBuilder builder(rewriter, evalInMem.getOperation()); + mlir::Location loc = evalInMem.getLoc(); + auto [temp, isHeapAllocated] = hlfir::computeEvaluateOpInNewTemp( + loc, builder, evalInMem, evalInMem.getShape(), evalInMem.getTypeparams()); + rewriter.replaceOpWithNewOp( + evalInMem, temp, /*mustFree=*/builder.createBool(loc, isHeapAllocated)); + return mlir::success(); +} + class OptimizedBufferizationPass : public hlfir::impl::OptimizedBufferizationBase< OptimizedBufferizationPass> { @@ -1130,6 +1224,7 @@ class OptimizedBufferizationPass patterns.insert(context); patterns.insert(context); patterns.insert(context); + patterns.insert(context); patterns.insert>(context); patterns.insert>(context); patterns.insert>(context); diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 0b7b3bafde008..0743fb60aa847 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -16,7 +16,8 @@ namespace fir { void addNestedPassToAllTopLevelOperations(mlir::PassManager &pm, PassConstructor ctor) { addNestedPassToOps(pm, ctor); + mlir::omp::PrivateClauseOp, fir::GlobalOp, + mlir::gpu::GPUModuleOp>(pm, ctor); } void addNestedPassToAllTopLevelOperationsConditionally( diff --git a/flang/lib/Optimizer/Transforms/AbstractResult.cpp b/flang/lib/Optimizer/Transforms/AbstractResult.cpp index e64280508755a..2eca349110f3a 100644 --- a/flang/lib/Optimizer/Transforms/AbstractResult.cpp +++ b/flang/lib/Optimizer/Transforms/AbstractResult.cpp @@ -14,6 +14,7 @@ #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/IR/Diagnostics.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" @@ -331,9 +332,10 @@ class AbstractResultOpt using fir::impl::AbstractResultOptBase< AbstractResultOpt>::AbstractResultOptBase; - void runOnSpecificOperation(mlir::func::FuncOp func, bool shouldBoxResult, - mlir::RewritePatternSet &patterns, - mlir::ConversionTarget &target) { + template + void runOnFunctionLikeOperation(OpTy func, bool shouldBoxResult, + mlir::RewritePatternSet &patterns, + mlir::ConversionTarget &target) { auto loc = func.getLoc(); auto *context = &getContext(); // Convert function type itself if it has an abstract result. 
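EvaluateIntoMemoryAssignBufferization plugs into the same rewrite-pattern machinery as the other patterns registered in OptimizedBufferizationPass. As a reference for readers less familiar with MLIR, here is a generic sketch of that recipe; MyOp, isProfitable, and buildCheaperForm are hypothetical stand-ins, while RewritePatternSet, OpRewritePattern, and the greedy driver are the real MLIR APIs used above:

```cpp
// Generic MLIR rewrite-pattern recipe (sketch, not part of the patch).
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

// Hypothetical op and helpers, for illustration only; a real registered op
// type is needed to actually instantiate OpRewritePattern<MyOp>.
class MyOp;
bool isProfitable(MyOp op);
mlir::Value buildCheaperForm(mlir::PatternRewriter &rewriter, MyOp op);

class MyOpRewrite : public mlir::OpRewritePattern<MyOp> {
public:
  using mlir::OpRewritePattern<MyOp>::OpRewritePattern;

  llvm::LogicalResult
  matchAndRewrite(MyOp op, mlir::PatternRewriter &rewriter) const override {
    // Returning failure() leaves the op untouched and the driver moves on,
    // exactly like the tryUsingAssignLhsDirectly bail-outs above.
    if (!isProfitable(op))
      return mlir::failure();
    rewriter.replaceOp(op, buildCheaperForm(rewriter, op));
    return mlir::success();
  }
};

void runMyRewrites(mlir::Operation *root) {
  mlir::RewritePatternSet patterns(root->getContext());
  patterns.insert<MyOpRewrite>(root->getContext());
  // The greedy driver applies the patterns repeatedly until a fixed point.
  if (mlir::failed(
          mlir::applyPatternsAndFoldGreedily(root, std::move(patterns))))
    root->emitError("pattern application failed to converge");
}
```

The two-step structure in the pass (try the in-place rewrite first, then fall back to a temporary plus hlfir.as_expr) keeps each pattern small and lets later patterns clean up whatever the fallback produces.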
@@ -384,6 +386,18 @@ class AbstractResultOpt } } + void runOnSpecificOperation(mlir::func::FuncOp func, bool shouldBoxResult, + mlir::RewritePatternSet &patterns, + mlir::ConversionTarget &target) { + runOnFunctionLikeOperation(func, shouldBoxResult, patterns, target); + } + + void runOnSpecificOperation(mlir::gpu::GPUFuncOp func, bool shouldBoxResult, + mlir::RewritePatternSet &patterns, + mlir::ConversionTarget &target) { + runOnFunctionLikeOperation(func, shouldBoxResult, patterns, target); + } + inline static bool containsFunctionTypeWithAbstractResult(mlir::Type type) { return mlir::TypeSwitch(type) .Case([](fir::BoxProcType boxProc) { @@ -448,6 +462,14 @@ class AbstractResultOpt mlir::TypeSwitch(op) .Case([&](auto op) { runOnSpecificOperation(op, shouldBoxResult, patterns, target); + }) + .Case([&](auto op) { + auto gpuMod = mlir::dyn_cast(*op); + for (auto funcOp : gpuMod.template getOps()) + runOnSpecificOperation(funcOp, shouldBoxResult, patterns, target); + for (auto gpuFuncOp : gpuMod.template getOps()) + runOnSpecificOperation(gpuFuncOp, shouldBoxResult, patterns, + target); }); // Convert the calls and, if needed, the ReturnOp in the function body. diff --git a/flang/test/Driver/bbc-mlir-pass-pipeline.f90 b/flang/test/Driver/bbc-mlir-pass-pipeline.f90 index 5520d750e2ce1..1f09e7ad4c2f5 100644 --- a/flang/test/Driver/bbc-mlir-pass-pipeline.f90 +++ b/flang/test/Driver/bbc-mlir-pass-pipeline.f90 @@ -17,12 +17,14 @@ ! CHECK-NEXT: (S) 0 num-cse'd - Number of operations CSE'd ! CHECK-NEXT: (S) 0 num-dce'd - Number of operations DCE'd -! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] ! CHECK-NEXT: 'fir.global' Pipeline ! CHECK-NEXT: CharacterConversion ! CHECK-NEXT: 'func.func' Pipeline ! CHECK-NEXT: ArrayValueCopy ! CHECK-NEXT: CharacterConversion +! CHECK-NEXT: 'gpu.module' Pipeline +! CHECK-NEXT: CharacterConversion ! CHECK-NEXT: 'omp.declare_reduction' Pipeline ! CHECK-NEXT: CharacterConversion ! CHECK-NEXT: 'omp.private' Pipeline @@ -48,13 +50,16 @@ ! CHECK-NEXT: PolymorphicOpConversion ! CHECK-NEXT: AssumedRankOpConversion -! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] ! CHECK-NEXT: 'fir.global' Pipeline ! CHECK-NEXT: StackReclaim ! CHECK-NEXT: CFGConversion ! CHECK-NEXT: 'func.func' Pipeline ! CHECK-NEXT: StackReclaim ! CHECK-NEXT: CFGConversion +! CHECK-NEXT: 'gpu.module' Pipeline +! CHECK-NEXT: StackReclaim +! CHECK-NEXT: CFGConversion ! CHECK-NEXT: 'omp.declare_reduction' Pipeline ! CHECK-NEXT: StackReclaim ! CHECK-NEXT: CFGConversion diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90 index ab5ddedf5fc18..4326953421e4b 100644 --- a/flang/test/Driver/mlir-debug-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90 @@ -28,11 +28,13 @@ ! ALL: Pass statistics report ! ALL: Fortran::lower::VerifierPass -! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] ! ALL-NEXT: 'fir.global' Pipeline ! ALL-NEXT: InlineElementals ! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: InlineElementals +! 
ALL-NEXT: 'gpu.module' Pipeline +! ALL-NEXT: InlineElementals ! ALL-NEXT: 'omp.declare_reduction' Pipeline ! ALL-NEXT: InlineElementals ! ALL-NEXT: 'omp.private' Pipeline @@ -49,12 +51,14 @@ ! ALL-NEXT: (S) 0 num-cse'd - Number of operations CSE'd ! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd -! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] ! ALL-NEXT: 'fir.global' Pipeline ! ALL-NEXT: CharacterConversion ! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: ArrayValueCopy ! ALL-NEXT: CharacterConversion +! ALL-NEXT: 'gpu.module' Pipeline +! ALL-NEXT: CharacterConversion ! ALL-NEXT: 'omp.declare_reduction' Pipeline ! ALL-NEXT: CharacterConversion ! ALL-NEXT: 'omp.private' Pipeline @@ -78,13 +82,16 @@ ! ALL-NEXT: PolymorphicOpConversion ! ALL-NEXT: AssumedRankOpConversion -! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] ! ALL-NEXT: 'fir.global' Pipeline ! ALL-NEXT: StackReclaim ! ALL-NEXT: CFGConversion ! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: StackReclaim ! ALL-NEXT: CFGConversion +! ALL-NEXT: 'gpu.module' Pipeline +! ALL-NEXT: StackReclaim +! ALL-NEXT: CFGConversion ! ALL-NEXT: 'omp.declare_reduction' Pipeline ! ALL-NEXT: StackReclaim ! ALL-NEXT: CFGConversion @@ -99,11 +106,13 @@ ! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd ! ALL-NEXT: BoxedProcedurePass -! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] ! ALL-NEXT: 'fir.global' Pipeline ! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: AbstractResultOpt +! ALL-NEXT: 'gpu.module' Pipeline +! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'omp.declare_reduction' Pipeline ! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'omp.private' Pipeline diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90 index 33c8183b27aef..6ffdbb0234e85 100644 --- a/flang/test/Driver/mlir-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-pass-pipeline.f90 @@ -16,13 +16,16 @@ ! ALL: Fortran::lower::VerifierPass ! O2-NEXT: Canonicalizer -! ALL: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! ALL: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] ! ALL-NEXT:'fir.global' Pipeline ! O2-NEXT: SimplifyHLFIRIntrinsics ! ALL: InlineElementals ! ALL-NEXT:'func.func' Pipeline ! O2-NEXT: SimplifyHLFIRIntrinsics ! ALL: InlineElementals +! ALL-NEXT:'gpu.module' Pipeline +! O2-NEXT: SimplifyHLFIRIntrinsics +! ALL: InlineElementals ! ALL-NEXT:'omp.declare_reduction' Pipeline ! O2-NEXT: SimplifyHLFIRIntrinsics ! ALL: InlineElementals @@ -33,11 +36,13 @@ ! O2-NEXT: CSE ! O2-NEXT: (S) {{.*}} num-cse'd ! O2-NEXT: (S) {{.*}} num-dce'd -! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] ! O2-NEXT: 'fir.global' Pipeline ! O2-NEXT: OptimizedBufferization ! O2-NEXT: 'func.func' Pipeline ! O2-NEXT: OptimizedBufferization +! O2-NEXT: 'gpu.module' Pipeline +! 
O2-NEXT: OptimizedBufferization ! O2-NEXT: 'omp.declare_reduction' Pipeline ! O2-NEXT: OptimizedBufferization ! O2-NEXT: 'omp.private' Pipeline @@ -54,12 +59,14 @@ ! ALL-NEXT: (S) 0 num-cse'd - Number of operations CSE'd ! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd -! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] ! ALL-NEXT: 'fir.global' Pipeline ! ALL-NEXT: CharacterConversion ! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: ArrayValueCopy ! ALL-NEXT: CharacterConversion +! ALL-NEXT: 'gpu.module' Pipeline +! ALL-NEXT: CharacterConversion ! ALL-NEXT: 'omp.declare_reduction' Pipeline ! ALL-NEXT: CharacterConversion ! ALL-NEXT: 'omp.private' Pipeline @@ -86,13 +93,16 @@ ! ALL-NEXT: AssumedRankOpConversion ! O2-NEXT: AddAliasTags -! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] ! ALL-NEXT: 'fir.global' Pipeline ! ALL-NEXT: StackReclaim ! ALL-NEXT: CFGConversion ! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: StackReclaim ! ALL-NEXT: CFGConversion +! ALL-NEXT: 'gpu.module' Pipeline +! ALL-NEXT: StackReclaim +! ALL-NEXT: CFGConversion ! ALL-NEXT: 'omp.declare_reduction' Pipeline ! ALL-NEXT: StackReclaim ! ALL-NEXT: CFGConversion @@ -108,11 +118,13 @@ ! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd ! ALL-NEXT: BoxedProcedurePass -! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] ! ALL-NEXT: 'fir.global' Pipeline ! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'func.func' Pipeline ! ALL-NEXT: AbstractResultOpt +! ALL-NEXT: 'gpu.module' Pipeline +! ALL-NEXT: AbstractResultOpt ! ALL-NEXT: 'omp.declare_reduction' Pipeline ! ALL-NEXT: AbstractResultOpt ! 
ALL-NEXT: 'omp.private' Pipeline diff --git a/flang/test/Driver/pp-fixed-form.f90 b/flang/test/Driver/pp-fixed-form.f90 new file mode 100644 index 0000000000000..4695da78763ae --- /dev/null +++ b/flang/test/Driver/pp-fixed-form.f90 @@ -0,0 +1,19 @@ +!RUN: %flang -save-temps -### %S/Inputs/free-form-test.f90 2>&1 | FileCheck %s --check-prefix=FREE +FREE: "-fc1" {{.*}} "-o" "free-form-test.i" {{.*}} "-x" "f95-cpp-input" "{{.*}}/free-form-test.f90" +FREE-NEXT: "-fc1" {{.*}} "-ffixed-form" {{.*}} "-x" "f95" "free-form-test.i" + +!RUN: %flang -save-temps -### %S/Inputs/fixed-form-test.f 2>&1 | FileCheck %s --check-prefix=FIXED +FIXED: "-fc1" {{.*}} "-o" "fixed-form-test.i" {{.*}} "-x" "f95-cpp-input" "{{.*}}/fixed-form-test.f" +FIXED-NEXT: "-fc1" {{.*}} "-ffixed-form" {{.*}} "-x" "f95" "fixed-form-test.i" + +!RUN: %flang -save-temps -### -ffree-form %S/Inputs/free-form-test.f90 2>&1 | FileCheck %s --check-prefix=FREE-FLAG +FREE-FLAG: "-fc1" {{.*}} "-o" "free-form-test.i" {{.*}} "-x" "f95-cpp-input" "{{.*}}/free-form-test.f90" +FREE-FLAG-NEXT: "-fc1" {{.*}} "-emit-llvm-bc" "-ffree-form" +FREE-FLAG-NOT: "-ffixed-form" +FREE-FLAG-SAME: "-x" "f95" "free-form-test.i" + +!RUN: %flang -save-temps -### -ffixed-form %S/Inputs/fixed-form-test.f 2>&1 | FileCheck %s --check-prefix=FIXED-FLAG +FIXED-FLAG: "-fc1" {{.*}} "-o" "fixed-form-test.i" {{.*}} "-x" "f95-cpp-input" "{{.*}}/fixed-form-test.f" +FIXED-FLAG-NEXT: "-fc1" {{.*}} "-emit-llvm-bc" "-ffixed-form" +FIXED-FLAG-NOT: "-ffixed-form" +FIXED-FLAG-SAME: "-x" "f95" "fixed-form-test.i" diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index ad5201af8311f..50b91ce340b3a 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -17,13 +17,16 @@ func.func @_QQmain() { // PASSES: Pass statistics report // PASSES: Canonicalizer -// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] // PASSES-NEXT: 'fir.global' Pipeline // PASSES-NEXT: SimplifyHLFIRIntrinsics // PASSES-NEXT: InlineElementals // PASSES-NEXT: 'func.func' Pipeline // PASSES-NEXT: SimplifyHLFIRIntrinsics // PASSES-NEXT: InlineElementals +// PASSES-NEXT: 'gpu.module' Pipeline +// PASSES-NEXT: SimplifyHLFIRIntrinsics +// PASSES-NEXT: InlineElementals // PASSES-NEXT: 'omp.declare_reduction' Pipeline // PASSES-NEXT: SimplifyHLFIRIntrinsics // PASSES-NEXT: InlineElementals @@ -34,11 +37,13 @@ func.func @_QQmain() { // PASSES-NEXT: CSE // PASSES-NEXT: (S) 0 num-cse'd - Number of operations CSE'd // PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd -// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] // PASSES-NEXT: 'fir.global' Pipeline // PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: 'func.func' Pipeline // PASSES-NEXT: OptimizedBufferization +// PASSES-NEXT: 'gpu.module' Pipeline +// PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: 'omp.declare_reduction' Pipeline // PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: 'omp.private' Pipeline @@ -52,12 +57,14 @@ func.func @_QQmain() { // PASSES-NEXT: (S) 0 num-cse'd - Number of operations CSE'd // PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd -// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 
'omp.declare_reduction', 'omp.private'] +// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] // PASSES-NEXT: 'fir.global' Pipeline // PASSES-NEXT: CharacterConversion // PASSES-NEXT: 'func.func' Pipeline // PASSES-NEXT: ArrayValueCopy // PASSES-NEXT: CharacterConversion +// PASSES-NEXT: 'gpu.module' Pipeline +// PASSES-NEXT: CharacterConversion // PASSES-NEXT: 'omp.declare_reduction' Pipeline // PASSES-NEXT: CharacterConversion // PASSES-NEXT: 'omp.private' Pipeline @@ -84,13 +91,16 @@ func.func @_QQmain() { // PASSES-NEXT: AssumedRankOpConversion // PASSES-NEXT: AddAliasTags -// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] // PASSES-NEXT: 'fir.global' Pipeline // PASSES-NEXT: StackReclaim // PASSES-NEXT: CFGConversion // PASSES-NEXT: 'func.func' Pipeline // PASSES-NEXT: StackReclaim // PASSES-NEXT: CFGConversion +// PASSES-NEXT: 'gpu.module' Pipeline +// PASSES-NEXT: StackReclaim +// PASSES-NEXT: CFGConversion // PASSES-NEXT: 'omp.declare_reduction' Pipeline // PASSES-NEXT: StackReclaim // PASSES-NEXT: CFGConversion @@ -106,11 +116,13 @@ func.func @_QQmain() { // PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd // PASSES-NEXT: BoxedProcedurePass -// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] +// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'gpu.module', 'omp.declare_reduction', 'omp.private'] // PASSES-NEXT: 'fir.global' Pipeline // PASSES-NEXT: AbstractResultOpt // PASSES-NEXT: 'func.func' Pipeline // PASSES-NEXT: AbstractResultOpt +// PASSES-NEXT: 'gpu.module' Pipeline +// PASSES-NEXT: AbstractResultOpt // PASSES-NEXT: 'omp.declare_reduction' Pipeline // PASSES-NEXT: AbstractResultOpt // PASSES-NEXT: 'omp.private' Pipeline diff --git a/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir b/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir new file mode 100644 index 0000000000000..984c0bcbaddcc --- /dev/null +++ b/flang/test/HLFIR/opt-bufferization-eval_in_mem.fir @@ -0,0 +1,67 @@ +// RUN: fir-opt --opt-bufferization %s | FileCheck %s + +// Fortran F2023 15.5.2.14 point 4. ensures that _QPfoo cannot access _QFtestEx +// and the temporary storage for the result can be avoided. 
+func.func @_QPtest(%arg0: !fir.ref> {fir.bindc_name = "x"}) { + %c10 = arith.constant 10 : index + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.shape %c10 : (index) -> !fir.shape<1> + %2:2 = hlfir.declare %arg0(%1) dummy_scope %0 {uniq_name = "_QFtestEx"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %3 = hlfir.eval_in_mem shape %1 : (!fir.shape<1>) -> !hlfir.expr<10xf32> { + ^bb0(%arg1: !fir.ref>): + %4 = fir.call @_QPfoo() fastmath : () -> !fir.array<10xf32> + fir.save_result %4 to %arg1(%1) : !fir.array<10xf32>, !fir.ref>, !fir.shape<1> + } + hlfir.assign %3 to %2#0 : !hlfir.expr<10xf32>, !fir.ref> + hlfir.destroy %3 : !hlfir.expr<10xf32> + return +} +func.func private @_QPfoo() -> !fir.array<10xf32> + +// CHECK-LABEL: func.func @_QPtest( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFtestEx"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_5:.*]] = fir.call @_QPfoo() fastmath : () -> !fir.array<10xf32> +// CHECK: fir.save_result %[[VAL_5]] to %[[VAL_4]]#1(%[[VAL_3]]) : !fir.array<10xf32>, !fir.ref>, !fir.shape<1> +// CHECK: return +// CHECK: } + + +// Temporary storage cannot be avoided in this case since +// _QFnegative_test_is_targetEx has the TARGET attribute. +func.func @_QPnegative_test_is_target(%arg0: !fir.ref> {fir.bindc_name = "x", fir.target}) { + %c10 = arith.constant 10 : index + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.shape %c10 : (index) -> !fir.shape<1> + %2:2 = hlfir.declare %arg0(%1) dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFnegative_test_is_targetEx"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %3 = hlfir.eval_in_mem shape %1 : (!fir.shape<1>) -> !hlfir.expr<10xf32> { + ^bb0(%arg1: !fir.ref>): + %4 = fir.call @_QPfoo() fastmath : () -> !fir.array<10xf32> + fir.save_result %4 to %arg1(%1) : !fir.array<10xf32>, !fir.ref>, !fir.shape<1> + } + hlfir.assign %3 to %2#0 : !hlfir.expr<10xf32>, !fir.ref> + hlfir.destroy %3 : !hlfir.expr<10xf32> + return +} +// CHECK-LABEL: func.func @_QPnegative_test_is_target( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> {fir.bindc_name = "x", fir.target}) { +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = arith.constant false +// CHECK: %[[VAL_3:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_4:.*]] = fir.alloca !fir.array<10xf32> +// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]]{{.*}} +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_4]]{{.*}} +// CHECK: %[[VAL_9:.*]] = fir.call @_QPfoo() fastmath : () -> !fir.array<10xf32> +// CHECK: fir.save_result %[[VAL_9]] to %[[VAL_8]]#1{{.*}} +// CHECK: %[[VAL_10:.*]] = hlfir.as_expr %[[VAL_8]]#0 move %[[VAL_2]] : (!fir.ref>, i1) -> !hlfir.expr<10xf32> +// CHECK: fir.do_loop %[[VAL_11:.*]] = %[[VAL_1]] to %[[VAL_3]] step %[[VAL_1]] unordered { +// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_10]], %[[VAL_11]] : (!hlfir.expr<10xf32>, index) -> f32 +// CHECK: %[[VAL_13:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_11]]) : (!fir.ref>, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_12]] to %[[VAL_13]] : f32, !fir.ref +// CHECK: } +// CHECK: hlfir.destroy %[[VAL_10]] : !hlfir.expr<10xf32> +// CHECK: return +// CHECK: } diff --git 
a/flang/test/HLFIR/order_assignments/where-scheduling.f90 b/flang/test/HLFIR/order_assignments/where-scheduling.f90 index 3010476d4a188..6feaba0d3389a 100644 --- a/flang/test/HLFIR/order_assignments/where-scheduling.f90 +++ b/flang/test/HLFIR/order_assignments/where-scheduling.f90 @@ -134,7 +134,7 @@ end function f !CHECK-NEXT: run 1 save : where/mask !CHECK-NEXT: run 2 evaluate: where/region_assign1 !CHECK-LABEL: ------------ scheduling where in _QPonly_once ------------ -!CHECK-NEXT: unknown effect: %{{[0-9]+}} = llvm.intr.stacksave : !llvm.ptr +!CHECK-NEXT: unknown effect: %11 = fir.call @_QPcall_me_only_once() fastmath : () -> !fir.array<10x!fir.logical<4>> !CHECK-NEXT: saving eval because write effect prevents re-evaluation !CHECK-NEXT: run 1 save (w): where/mask !CHECK-NEXT: run 2 evaluate: where/region_assign1 diff --git a/flang/test/Lower/HLFIR/calls-array-results.f90 b/flang/test/Lower/HLFIR/calls-array-results.f90 new file mode 100644 index 0000000000000..d91844cc2e6f8 --- /dev/null +++ b/flang/test/Lower/HLFIR/calls-array-results.f90 @@ -0,0 +1,131 @@ +! RUN: bbc -emit-hlfir -o - %s -I nowhere | FileCheck %s + +subroutine simple_test() + implicit none + interface + function array_func() + real :: array_func(10) + end function + end interface + real :: x(10) + x = array_func() +end subroutine + +subroutine arg_test(n) + implicit none + interface + function array_func_2(n) + integer(8) :: n + real :: array_func_2(n) + end function + end interface + integer(8) :: n + real :: x(n) + x = array_func_2(n) +end subroutine + +module type_defs + interface + function array_func() + real :: array_func(10) + end function + end interface + type t + contains + procedure, nopass :: array_func => array_func + end type +end module + +subroutine dispatch_test(x, a) + use type_defs, only : t + implicit none + real :: x(10) + class(t) :: a + x = a%array_func() +end subroutine + +! CHECK-LABEL: func.func @_QPsimple_test() { +! CHECK: %[[VAL_0:.*]] = arith.constant 10 : index +! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.array<10xf32> {bindc_name = "x", uniq_name = "_QFsimple_testEx"} +! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_2]]) {uniq_name = "_QFsimple_testEx"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) +! CHECK: %[[VAL_4:.*]] = arith.constant 10 : i64 +! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i64 +! CHECK: %[[VAL_6:.*]] = arith.subi %[[VAL_4]], %[[VAL_5]] : i64 +! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i64 +! CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_6]], %[[VAL_7]] : i64 +! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i64) -> index +! CHECK: %[[VAL_10:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_9]], %[[VAL_10]] : index +! CHECK: %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_9]], %[[VAL_10]] : index +! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_14:.*]] = hlfir.eval_in_mem shape %[[VAL_13]] : (!fir.shape<1>) -> !hlfir.expr<10xf32> { +! CHECK: ^bb0(%[[VAL_15:.*]]: !fir.ref>): +! CHECK: %[[VAL_16:.*]] = fir.call @_QParray_func() fastmath : () -> !fir.array<10xf32> +! CHECK: fir.save_result %[[VAL_16]] to %[[VAL_15]](%[[VAL_13]]) : !fir.array<10xf32>, !fir.ref>, !fir.shape<1> +! CHECK: } +! CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_3]]#0 : !hlfir.expr<10xf32>, !fir.ref> +! CHECK: hlfir.destroy %[[VAL_14]] : !hlfir.expr<10xf32> +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QParg_test( +! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "n"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFarg_testEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref +! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index +! CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : index +! CHECK: %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : index +! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.array, %[[VAL_7]] {bindc_name = "x", uniq_name = "_QFarg_testEx"} +! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]](%[[VAL_9]]) {uniq_name = "_QFarg_testEx"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +! CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_2]]#1 {uniq_name = "_QFarg_testFarray_func_2En"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref +! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i64 +! CHECK: %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64 +! CHECK: %[[VAL_15:.*]] = arith.constant 1 : i64 +! CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_14]], %[[VAL_15]] : i64 +! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i64) -> index +! CHECK: %[[VAL_18:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_19:.*]] = arith.cmpi sgt, %[[VAL_17]], %[[VAL_18]] : index +! CHECK: %[[VAL_20:.*]] = arith.select %[[VAL_19]], %[[VAL_17]], %[[VAL_18]] : index +! CHECK: %[[VAL_21:.*]] = fir.shape %[[VAL_20]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_22:.*]] = hlfir.eval_in_mem shape %[[VAL_21]] : (!fir.shape<1>) -> !hlfir.expr { +! CHECK: ^bb0(%[[VAL_23:.*]]: !fir.ref>): +! CHECK: %[[VAL_24:.*]] = fir.call @_QParray_func_2(%[[VAL_2]]#1) fastmath : (!fir.ref) -> !fir.array +! CHECK: fir.save_result %[[VAL_24]] to %[[VAL_23]](%[[VAL_21]]) : !fir.array, !fir.ref>, !fir.shape<1> +! CHECK: } +! CHECK: hlfir.assign %[[VAL_22]] to %[[VAL_10]]#0 : !hlfir.expr, !fir.box> +! CHECK: hlfir.destroy %[[VAL_22]] : !hlfir.expr +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPdispatch_test( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> {fir.bindc_name = "x"}, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.class> {fir.bindc_name = "a"}) { +! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFdispatch_testEa"} : (!fir.class>, !fir.dscope) -> (!fir.class>, !fir.class>) +! CHECK: %[[VAL_4:.*]] = arith.constant 10 : index +! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) dummy_scope %[[VAL_2]] {uniq_name = "_QFdispatch_testEx"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) +! CHECK: %[[VAL_7:.*]] = arith.constant 10 : i64 +! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i64 +! CHECK: %[[VAL_9:.*]] = arith.subi %[[VAL_7]], %[[VAL_8]] : i64 +! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i64 +! CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i64 +! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i64) -> index +! CHECK: %[[VAL_13:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_13]] : index +! CHECK: %[[VAL_15:.*]] = arith.select %[[VAL_14]], %[[VAL_12]], %[[VAL_13]] : index +! 
CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_15]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_17:.*]] = hlfir.eval_in_mem shape %[[VAL_16]] : (!fir.shape<1>) -> !hlfir.expr<10xf32> { +! CHECK: ^bb0(%[[VAL_18:.*]]: !fir.ref>): +! CHECK: %[[VAL_19:.*]] = fir.dispatch "array_func"(%[[VAL_3]]#1 : !fir.class>) -> !fir.array<10xf32> +! CHECK: fir.save_result %[[VAL_19]] to %[[VAL_18]](%[[VAL_16]]) : !fir.array<10xf32>, !fir.ref>, !fir.shape<1> +! CHECK: } +! CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_6]]#0 : !hlfir.expr<10xf32>, !fir.ref> +! CHECK: hlfir.destroy %[[VAL_17]] : !hlfir.expr<10xf32> +! CHECK: return +! CHECK: } diff --git a/flang/test/Lower/HLFIR/where-nonelemental.f90 b/flang/test/Lower/HLFIR/where-nonelemental.f90 index 643f417c47674..7be5831890012 100644 --- a/flang/test/Lower/HLFIR/where-nonelemental.f90 +++ b/flang/test/Lower/HLFIR/where-nonelemental.f90 @@ -26,11 +26,12 @@ real elemental function elem_func(x) ! CHECK-LABEL: func.func @_QPtest_where( ! CHECK: hlfir.where { ! CHECK-NOT: hlfir.exactly_once -! CHECK: %[[VAL_17:.*]] = llvm.intr.stacksave : !llvm.ptr -! CHECK: %[[VAL_19:.*]] = fir.call @_QPlogical_func1() fastmath : () -> !fir.array<100x!fir.logical<4>> -! CHECK: hlfir.yield %{{.*}} : !hlfir.expr<100x!fir.logical<4>> cleanup { -! CHECK: llvm.intr.stackrestore %[[VAL_17]] : !llvm.ptr -! CHECK: } +! CHECK: %[[VAL_19:.*]] = hlfir.eval_in_mem {{.*}} { +! CHECK: fir.call @_QPlogical_func1() fastmath : () -> !fir.array<100x!fir.logical<4>> +! CHECK: } +! CHECK: hlfir.yield %[[VAL_19]] : !hlfir.expr<100x!fir.logical<4>> cleanup { +! CHECK: hlfir.destroy %[[VAL_19]] +! CHECK: } ! CHECK: } do { ! CHECK: hlfir.region_assign { ! CHECK: %[[VAL_24:.*]] = hlfir.exactly_once : f32 { @@ -70,10 +71,11 @@ real elemental function elem_func(x) ! CHECK: } ! CHECK: hlfir.elsewhere mask { ! CHECK: %[[VAL_62:.*]] = hlfir.exactly_once : !hlfir.expr<100x!fir.logical<4>> { -! CHECK: %[[VAL_72:.*]] = llvm.intr.stacksave : !llvm.ptr -! CHECK: fir.call @_QPlogical_func2() fastmath : () -> !fir.array<100x!fir.logical<4>> -! CHECK: hlfir.yield %{{.*}} : !hlfir.expr<100x!fir.logical<4>> cleanup { -! CHECK: llvm.intr.stackrestore %[[VAL_72]] : !llvm.ptr +! CHECK: %[[VAL_72:.*]] = hlfir.eval_in_mem {{.*}} { +! CHECK: fir.call @_QPlogical_func2() fastmath : () -> !fir.array<100x!fir.logical<4>> +! CHECK: } +! CHECK: hlfir.yield %[[VAL_72]] : !hlfir.expr<100x!fir.logical<4>> cleanup { +! CHECK: hlfir.destroy %[[VAL_72]] ! CHECK: } ! CHECK: } ! CHECK: hlfir.yield %[[VAL_62]] : !hlfir.expr<100x!fir.logical<4>> @@ -123,11 +125,12 @@ integer pure function pure_ifoo() ! CHECK: } (%[[VAL_10:.*]]: i32) { ! CHECK: %[[VAL_11:.*]] = hlfir.forall_index "i" %[[VAL_10]] : (i32) -> !fir.ref ! CHECK: hlfir.where { -! CHECK: %[[VAL_21:.*]] = llvm.intr.stacksave : !llvm.ptr ! CHECK-NOT: hlfir.exactly_once -! CHECK: %[[VAL_23:.*]] = fir.call @_QPpure_logical_func1() proc_attrs fastmath : () -> !fir.array<100x!fir.logical<4>> -! CHECK: hlfir.yield %{{.*}} : !hlfir.expr<100x!fir.logical<4>> cleanup { -! CHECK: llvm.intr.stackrestore %[[VAL_21]] : !llvm.ptr +! CHECK: %[[VAL_23:.*]] = hlfir.eval_in_mem {{.*}} { +! CHECK: fir.call @_QPpure_logical_func1() proc_attrs fastmath : () -> !fir.array<100x!fir.logical<4>> +! CHECK: } +! CHECK: hlfir.yield %[[VAL_23]] : !hlfir.expr<100x!fir.logical<4>> cleanup { +! CHECK: hlfir.destroy %[[VAL_23]] ! CHECK: } ! CHECK: } do { ! CHECK: hlfir.region_assign { @@ -172,10 +175,11 @@ integer pure function pure_ifoo() ! CHECK: } ! CHECK: hlfir.elsewhere mask { ! 
CHECK: %[[VAL_129:.*]] = hlfir.exactly_once : !hlfir.expr<100x!fir.logical<4>> { -! CHECK: %[[VAL_139:.*]] = llvm.intr.stacksave : !llvm.ptr -! CHECK: %[[VAL_141:.*]] = fir.call @_QPpure_logical_func2() proc_attrs fastmath : () -> !fir.array<100x!fir.logical<4>> -! CHECK: hlfir.yield %{{.*}} : !hlfir.expr<100x!fir.logical<4>> cleanup { -! CHECK: llvm.intr.stackrestore %[[VAL_139]] : !llvm.ptr +! CHECK: %[[VAL_139:.*]] = hlfir.eval_in_mem {{.*}} { +! CHECK: fir.call @_QPpure_logical_func2() proc_attrs fastmath : () -> !fir.array<100x!fir.logical<4>> +! CHECK: } +! CHECK: hlfir.yield %[[VAL_139]] : !hlfir.expr<100x!fir.logical<4>> cleanup { +! CHECK: hlfir.destroy %[[VAL_139]] ! CHECK: } ! CHECK: } ! CHECK: hlfir.yield %[[VAL_129]] : !hlfir.expr<100x!fir.logical<4>> diff --git a/flang/test/Lower/explicit-interface-results-2.f90 b/flang/test/Lower/explicit-interface-results-2.f90 index 95aee84f4a644..2336053c32a54 100644 --- a/flang/test/Lower/explicit-interface-results-2.f90 +++ b/flang/test/Lower/explicit-interface-results-2.f90 @@ -252,12 +252,10 @@ subroutine test_call_to_used_interface(dummy_proc) call takes_array(dummy_proc()) ! CHECK: %[[VAL_1:.*]] = arith.constant 100 : index ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.array<100xf32> {bindc_name = ".result"} -! CHECK: %[[VAL_3:.*]] = llvm.intr.stacksave : !llvm.ptr ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_0]] : (!fir.boxproc<() -> ()>) -> (() -> !fir.array<100xf32>) ! CHECK: %[[VAL_6:.*]] = fir.call %[[VAL_5]]() {{.*}}: () -> !fir.array<100xf32> ! CHECK: fir.save_result %[[VAL_6]] to %[[VAL_2]](%[[VAL_4]]) : !fir.array<100xf32>, !fir.ref>, !fir.shape<1> ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_2]] : (!fir.ref>) -> !fir.ref> ! CHECK: fir.call @_QPtakes_array(%[[VAL_7]]) {{.*}}: (!fir.ref>) -> () -! CHECK: llvm.intr.stackrestore %[[VAL_3]] : !llvm.ptr end subroutine diff --git a/flang/test/Lower/explicit-interface-results.f90 b/flang/test/Lower/explicit-interface-results.f90 index 623e875b5f9c9..612d57be36448 100644 --- a/flang/test/Lower/explicit-interface-results.f90 +++ b/flang/test/Lower/explicit-interface-results.f90 @@ -195,8 +195,8 @@ subroutine dyn_array(m, n) ! CHECK-DAG: %[[ncast2:.*]] = fir.convert %[[nadd]] : (i64) -> index ! CHECK-DAG: %[[ncmpi:.*]] = arith.cmpi sgt, %[[ncast2]], %{{.*}} : index ! CHECK-DAG: %[[nselect:.*]] = arith.select %[[ncmpi]], %[[ncast2]], %{{.*}} : index - ! CHECK: %[[tmp:.*]] = fir.alloca !fir.array, %[[mselect]], %[[nselect]] ! CHECK: %[[shape:.*]] = fir.shape %[[mselect]], %[[nselect]] : (index, index) -> !fir.shape<2> + ! CHECK: %[[tmp:.*]] = fir.alloca !fir.array, %[[mselect]], %[[nselect]] ! CHECK: %[[res:.*]] = fir.call @_QMcalleePreturn_dyn_array(%[[m]], %[[n]]) {{.*}}: (!fir.ref, !fir.ref) -> !fir.array ! CHECK: fir.save_result %[[res]] to %[[tmp]](%[[shape]]) : !fir.array, !fir.ref>, !fir.shape<2> print *, return_dyn_array(m, n) @@ -211,8 +211,8 @@ subroutine dyn_char_cst_array(l) ! CHECK: %[[lcast2:.*]] = fir.convert %[[lcast]] : (i64) -> index ! CHECK: %[[cmpi:.*]] = arith.cmpi sgt, %[[lcast2]], %{{.*}} : index ! CHECK: %[[select:.*]] = arith.select %[[cmpi]], %[[lcast2]], %{{.*}} : index - ! CHECK: %[[tmp:.*]] = fir.alloca !fir.array<20x30x!fir.char<1,?>>(%[[select]] : index) ! CHECK: %[[shape:.*]] = fir.shape %{{.*}}, %{{.*}} : (index, index) -> !fir.shape<2> + ! CHECK: %[[tmp:.*]] = fir.alloca !fir.array<20x30x!fir.char<1,?>>(%[[select]] : index) ! 
CHECK: %[[res:.*]] = fir.call @_QMcalleePreturn_dyn_char_cst_array(%[[l]]) {{.*}}: (!fir.ref) -> !fir.array<20x30x!fir.char<1,?>> ! CHECK: fir.save_result %[[res]] to %[[tmp]](%[[shape]]) typeparams %[[select]] : !fir.array<20x30x!fir.char<1,?>>, !fir.ref>>, !fir.shape<2>, index print *, return_dyn_char_cst_array(l) @@ -236,8 +236,8 @@ subroutine cst_char_dyn_array(m, n) ! CHECK-DAG: %[[ncast2:.*]] = fir.convert %[[nadd]] : (i64) -> index ! CHECK-DAG: %[[ncmpi:.*]] = arith.cmpi sgt, %[[ncast2]], %{{.*}} : index ! CHECK-DAG: %[[nselect:.*]] = arith.select %[[ncmpi]], %[[ncast2]], %{{.*}} : index - ! CHECK: %[[tmp:.*]] = fir.alloca !fir.array>, %[[mselect]], %[[nselect]] ! CHECK: %[[shape:.*]] = fir.shape %[[mselect]], %[[nselect]] : (index, index) -> !fir.shape<2> + ! CHECK: %[[tmp:.*]] = fir.alloca !fir.array>, %[[mselect]], %[[nselect]] ! CHECK: %[[res:.*]] = fir.call @_QMcalleePreturn_cst_char_dyn_array(%[[m]], %[[n]]) {{.*}}: (!fir.ref, !fir.ref) -> !fir.array> ! CHECK: fir.save_result %[[res]] to %[[tmp]](%[[shape]]) typeparams {{.*}} : !fir.array>, !fir.ref>>, !fir.shape<2>, index print *, return_cst_char_dyn_array(m, n) @@ -267,8 +267,8 @@ subroutine dyn_char_dyn_array(l, m, n) ! CHECK-DAG: %[[lcast2:.*]] = fir.convert %[[lcast]] : (i64) -> index ! CHECK-DAG: %[[lcmpi:.*]] = arith.cmpi sgt, %[[lcast2]], %{{.*}} : index ! CHECK-DAG: %[[lselect:.*]] = arith.select %[[lcmpi]], %[[lcast2]], %{{.*}} : index - ! CHECK: %[[tmp:.*]] = fir.alloca !fir.array>(%[[lselect]] : index), %[[mselect]], %[[nselect]] ! CHECK: %[[shape:.*]] = fir.shape %[[mselect]], %[[nselect]] : (index, index) -> !fir.shape<2> + ! CHECK: %[[tmp:.*]] = fir.alloca !fir.array>(%[[lselect]] : index), %[[mselect]], %[[nselect]] ! CHECK: %[[res:.*]] = fir.call @_QMcalleePreturn_dyn_char_dyn_array(%[[l]], %[[m]], %[[n]]) {{.*}}: (!fir.ref, !fir.ref, !fir.ref) -> !fir.array> ! CHECK: fir.save_result %[[res]] to %[[tmp]](%[[shape]]) typeparams {{.*}} : !fir.array>, !fir.ref>>, !fir.shape<2>, index integer :: l, m, n diff --git a/flang/test/Lower/forall/array-constructor.f90 b/flang/test/Lower/forall/array-constructor.f90 index 4c8c756ea689c..6b6b46fdd4688 100644 --- a/flang/test/Lower/forall/array-constructor.f90 +++ b/flang/test/Lower/forall/array-constructor.f90 @@ -232,8 +232,8 @@ end subroutine ac2 ! CHECK: %[[C0:.*]] = arith.constant 0 : index ! CHECK: %[[CMPI:.*]] = arith.cmpi sgt, %[[VAL_80]], %[[C0]] : index ! CHECK: %[[SELECT:.*]] = arith.select %[[CMPI]], %[[VAL_80]], %[[C0]] : index -! CHECK: %[[VAL_81:.*]] = llvm.intr.stacksave : !llvm.ptr ! CHECK: %[[VAL_82:.*]] = fir.shape %[[SELECT]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_81:.*]] = llvm.intr.stacksave : !llvm.ptr ! CHECK: %[[VAL_83:.*]] = fir.convert %[[VAL_74]] : (!fir.box>) -> !fir.box> ! CHECK: %[[VAL_84:.*]] = fir.call @_QFac2Pfunc(%[[VAL_83]]) {{.*}}: (!fir.box>) -> !fir.array<3xi32> ! 
CHECK: fir.save_result %[[VAL_84]] to %[[VAL_2]](%[[VAL_82]]) : !fir.array<3xi32>, !fir.ref>, !fir.shape<1> diff --git a/flang/unittests/Runtime/CommandTest.cpp b/flang/unittests/Runtime/CommandTest.cpp index 05287d80e14f5..ecb325330f1ad 100644 --- a/flang/unittests/Runtime/CommandTest.cpp +++ b/flang/unittests/Runtime/CommandTest.cpp @@ -352,9 +352,6 @@ TEST_F(ZeroArguments, ECLGeneralErrorCommandErrorSync) { #if defined(_WIN32) CheckDescriptorEqInt(cmdStat.get(), 6); CheckDescriptorEqStr(cmdMsg.get(), "Invalid command lineXXXXXXXXX"); -#elif defined(_AIX) - CheckDescriptorEqInt(cmdStat.get(), 6); - CheckDescriptorEqStr(cmdMsg.get(), "Invalid command lineXXXXXXXXX"); #else CheckDescriptorEqInt(cmdStat.get(), 3); CheckDescriptorEqStr(cmdMsg.get(), "Command line execution failed"); diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index eee5b63bab513..11a355b120360 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -50,52 +50,14 @@ set(LIBC_NAMESPACE ${default_namespace} CACHE STRING "The namespace to use to enclose internal implementations. Must start with '__llvm_libc'." ) - -add_subdirectory(newhdrgen) - - -if(LLVM_LIBC_FULL_BUILD OR LLVM_LIBC_GPU_BUILD) - if(NOT LIBC_HDRGEN_EXE) - # We need to set up hdrgen first since other targets depend on it. - add_subdirectory(utils/LibcTableGenUtil) - add_subdirectory(utils/HdrGen) - # Calling add_tablegen sets variables like LIBC_TABLEGEN_EXE in - # PARENT_SCOPE which get lost until saved in the cache. - set(LIBC_TABLEGEN_EXE "${LIBC_TABLEGEN_EXE}" CACHE INTERNAL "") - set(LIBC_TABLEGEN_TARGET "${LIBC_TABLEGEN_TARGET}" CACHE INTERNAL "") - else() - message(STATUS "Will use ${LIBC_HDRGEN_EXE} for libc header generation.") - endif() -endif() # We will build the GPU utilities if we are not doing a runtimes build. option(LIBC_BUILD_GPU_LOADER "Always build the GPU loader utilities" OFF) -if(LIBC_BUILD_GPU_LOADER OR (LLVM_LIBC_GPU_BUILD AND NOT LLVM_RUNTIMES_BUILD)) +if(LIBC_BUILD_GPU_LOADER OR ((NOT LLVM_RUNTIMES_BUILD) AND LLVM_LIBC_GPU_BUILD)) add_subdirectory(utils/gpu) -endif() - -option(LIBC_USE_NEW_HEADER_GEN "Generate header files using new headergen instead of the old one" ON) - -set(NEED_LIBC_HDRGEN FALSE) -if(NOT LLVM_RUNTIMES_BUILD) - if("libc" IN_LIST LLVM_ENABLE_RUNTIMES) - set(NEED_LIBC_HDRGEN TRUE) - else() - foreach(_name ${LLVM_RUNTIME_TARGETS}) - if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES) - set(NEED_LIBC_HDRGEN TRUE) - break() - endif() - endforeach() - endif() -endif() -option(LIBC_HDRGEN_ONLY "Only build the 'libc-hdrgen' executable" OFF) -if(LIBC_HDRGEN_ONLY OR NEED_LIBC_HDRGEN) - # When libc is build as part of the runtimes/bootstrap build's CMake run, we - # only need to build the host tools to build the libc. So, we just do enough - # to build libc-hdrgen and return. return() endif() -unset(NEED_LIBC_HDRGEN) + +add_subdirectory(newhdrgen) option(LIBC_CMAKE_VERBOSE_LOGGING "Log details warnings and notifications during CMake configuration." 
OFF) diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake index 76c4e1f2d3244..8f24cd4b3023b 100644 --- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake +++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake @@ -71,9 +71,9 @@ function(add_header target_name) ) endfunction(add_header) -function(add_gen_header2 target_name) +function(add_gen_header target_name) cmake_parse_arguments( - "ADD_GEN_HDR2" + "ADD_GEN_HDR" "PUBLIC" # No optional arguments "YAML_FILE;DEF_FILE;GEN_HDR" # Single value arguments "DEPENDS" # Multi value arguments @@ -84,25 +84,25 @@ function(add_gen_header2 target_name) add_library(${fq_target_name} INTERFACE) return() endif() - if(NOT ADD_GEN_HDR2_DEF_FILE) - message(FATAL_ERROR "`add_gen_hdr2` rule requires DEF_FILE to be specified.") + if(NOT ADD_GEN_HDR_DEF_FILE) + message(FATAL_ERROR "`add_gen_hdr` rule requires DEF_FILE to be specified.") endif() - if(NOT ADD_GEN_HDR2_GEN_HDR) - message(FATAL_ERROR "`add_gen_hdr2` rule requires GEN_HDR to be specified.") + if(NOT ADD_GEN_HDR_GEN_HDR) + message(FATAL_ERROR "`add_gen_hdr` rule requires GEN_HDR to be specified.") endif() - if(NOT ADD_GEN_HDR2_YAML_FILE) - message(FATAL_ERROR "`add_gen_hdr2` rule requires YAML_FILE to be specified.") + if(NOT ADD_GEN_HDR_YAML_FILE) + message(FATAL_ERROR "`add_gen_hdr` rule requires YAML_FILE to be specified.") endif() - set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR2_GEN_HDR}) + set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR_GEN_HDR}) file(RELATIVE_PATH relative_path ${LIBC_INCLUDE_SOURCE_DIR} ${absolute_path}) set(out_file ${LIBC_INCLUDE_DIR}/${relative_path}) - set(yaml_file ${CMAKE_SOURCE_DIR}/${ADD_GEN_HDR2_YAML_FILE}) - set(def_file ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR2_DEF_FILE}) + set(yaml_file ${CMAKE_SOURCE_DIR}/${ADD_GEN_HDR_YAML_FILE}) + set(def_file ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR_DEF_FILE}) set(fq_data_files "") - if(ADD_GEN_HDR2_DATA_FILES) - foreach(data_file IN LISTS ADD_GEN_HDR2_DATA_FILES) + if(ADD_GEN_HDR_DATA_FILES) + foreach(data_file IN LISTS ADD_GEN_HDR_DATA_FILES) list(APPEND fq_data_files "${CMAKE_CURRENT_SOURCE_DIR}/${data_file}") endforeach(data_file) endif() @@ -118,7 +118,7 @@ function(add_gen_header2 target_name) ${entry_points} --output_dir ${out_file} DEPENDS ${yaml_file} ${def_file} ${fq_data_files} - COMMENT "Generating header ${ADD_GEN_HDR2_GEN_HDR} from ${yaml_file} and ${def_file}" + COMMENT "Generating header ${ADD_GEN_HDR_GEN_HDR} from ${yaml_file} and ${def_file}" ) if(LIBC_TARGET_OS_IS_GPU) file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls) @@ -136,132 +136,6 @@ function(add_gen_header2 target_name) ) endif() - if(ADD_GEN_HDR2_DEPENDS) - get_fq_deps_list(fq_deps_list ${ADD_GEN_HDR2_DEPENDS}) - # Dependencies of a add_header target can only be another add_gen_header target - # or an add_header target. 
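# For reference, a minimal sketch of a call site for the renamed rule, in the
# style of the add_header_macro wrapper in libc/include/CMakeLists.txt further
# below; the "ctype" target and the paths are illustrative assumptions, not
# lines taken from this patch:
#
#   add_gen_header(
#     ctype                                        # expanded to a fully qualified target name
#     YAML_FILE ../libc/newhdrgen/yaml/ctype.yaml  # resolved against ${CMAKE_SOURCE_DIR}
#     DEF_FILE ctype.h.def                         # resolved against ${CMAKE_CURRENT_SOURCE_DIR}
#     GEN_HDR ctype.h                              # generated into ${LIBC_INCLUDE_DIR}
#     DEPENDS .llvm_libc_common_h
#   )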
- foreach(dep IN LISTS fq_deps_list) - get_target_property(header_file ${dep} HEADER_FILE_PATH) - if(NOT header_file) - message(FATAL_ERROR "Invalid dependency '${dep}' for '${fq_target_name}'.") - endif() - endforeach() - endif() - set(generated_hdr_target ${fq_target_name}.__generated_hdr__) - add_custom_target( - ${generated_hdr_target} - DEPENDS ${out_file} ${fq_deps_list} ${decl_out_file} - ) - - add_header_library( - ${target_name} - HDRS - ${out_file} - ) - - add_dependencies(${fq_target_name} ${generated_hdr_target}) - - set_target_properties( - ${fq_target_name} - PROPERTIES - HEADER_FILE_PATH ${out_file} - DECLS_FILE_PATH "${decl_out_file}" - DEPS "${fq_deps_list}" - ) - - -endfunction(add_gen_header2) - -# Usage: -# add_gen_header( -# -# DEF_FILE <.h.def file> -# GEN_HDR -# PARAMS -# DATA_FILES -# ) -function(add_gen_header target_name) - cmake_parse_arguments( - "ADD_GEN_HDR" - "PUBLIC" # No optional arguments - "DEF_FILE;GEN_HDR" # Single value arguments - "PARAMS;DATA_FILES;DEPENDS" # Multi value arguments - ${ARGN} - ) - get_fq_target_name(${target_name} fq_target_name) - if(NOT LLVM_LIBC_FULL_BUILD) - # We don't want to use generated headers if we are doing a non-full-build. - add_library(${fq_target_name} INTERFACE) - return() - endif() - if(NOT ADD_GEN_HDR_DEF_FILE) - message(FATAL_ERROR "`add_gen_hdr` rule requires DEF_FILE to be specified.") - endif() - if(NOT ADD_GEN_HDR_GEN_HDR) - message(FATAL_ERROR "`add_gen_hdr` rule requires GEN_HDR to be specified.") - endif() - - set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR_GEN_HDR}) - file(RELATIVE_PATH relative_path ${LIBC_INCLUDE_SOURCE_DIR} ${absolute_path}) - set(out_file ${LIBC_INCLUDE_DIR}/${relative_path}) - set(in_file ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR_DEF_FILE}) - - set(fq_data_files "") - if(ADD_GEN_HDR_DATA_FILES) - foreach(data_file IN LISTS ADD_GEN_HDR_DATA_FILES) - list(APPEND fq_data_files "${CMAKE_CURRENT_SOURCE_DIR}/${data_file}") - endforeach(data_file) - endif() - - set(replacement_params "") - if(ADD_GEN_HDR_PARAMS) - list(APPEND replacement_params "--args" ${ADD_GEN_HDR_PARAMS}) - endif() - - set(gen_hdr_script "${LIBC_BUILD_SCRIPTS_DIR}/gen_hdr.py") - - file(GLOB td_includes ${LIBC_SOURCE_DIR}/spec/*.td) - - set(ENTRYPOINT_NAME_LIST_ARG ${TARGET_ENTRYPOINT_NAME_LIST}) - list(TRANSFORM ENTRYPOINT_NAME_LIST_ARG PREPEND "--e=") - - if(LIBC_HDRGEN_EXE) - set(hdrgen_exe ${LIBC_HDRGEN_EXE}) - else() - set(hdrgen_exe ${LIBC_TABLEGEN_EXE}) - set(hdrgen_deps "${LIBC_TABLEGEN_EXE};${LIBC_TABLEGEN_TARGET}") - endif() - add_custom_command( - OUTPUT ${out_file} - COMMAND ${hdrgen_exe} -o ${out_file} --header ${ADD_GEN_HDR_GEN_HDR} - --def ${in_file} ${replacement_params} -I ${LIBC_SOURCE_DIR} - ${ENTRYPOINT_NAME_LIST_ARG} - ${LIBC_SOURCE_DIR}/config/${LIBC_TARGET_OS}/api.td - - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS ${in_file} ${fq_data_files} ${td_includes} - ${LIBC_SOURCE_DIR}/config/${LIBC_TARGET_OS}/api.td - ${hdrgen_deps} - ) - - if(LIBC_TARGET_OS_IS_GPU) - file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls) - file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls/gpu) - set(decl_out_file ${LIBC_INCLUDE_DIR}/llvm-libc-decls/${relative_path}) - add_custom_command( - OUTPUT ${decl_out_file} - COMMAND ${hdrgen_exe} -o ${decl_out_file} - --header ${ADD_GEN_HDR_GEN_HDR} --def ${in_file} --export-decls - ${replacement_params} -I ${LIBC_SOURCE_DIR} ${ENTRYPOINT_NAME_LIST_ARG} - ${LIBC_SOURCE_DIR}/config/${LIBC_TARGET_OS}/api.td - - WORKING_DIRECTORY 
${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS ${in_file} ${fq_data_files} ${td_includes} - ${LIBC_SOURCE_DIR}/config/${LIBC_TARGET_OS}/api.td - ${hdrgen_deps} - ) - endif() - if(ADD_GEN_HDR_DEPENDS) get_fq_deps_list(fq_deps_list ${ADD_GEN_HDR_DEPENDS}) # Dependencies of a add_header target can only be another add_gen_header target @@ -285,9 +159,6 @@ function(add_gen_header target_name) ${out_file} ) - # We add the dependencies separately and not list under add_header_library's - # DEPENDS option above. This is because, deps of add_header_library are - # used with target_link_libraries. add_dependencies(${fq_target_name} ${generated_hdr_target}) set_target_properties( @@ -297,4 +168,6 @@ function(add_gen_header target_name) DECLS_FILE_PATH "${decl_out_file}" DEPS "${fq_deps_list}" ) + + endfunction(add_gen_header) diff --git a/libc/config/baremetal/api.td b/libc/config/baremetal/api.td deleted file mode 100644 index 7421d86fabeb0..0000000000000 --- a/libc/config/baremetal/api.td +++ /dev/null @@ -1,59 +0,0 @@ -include "config/public_api.td" - -include "spec/stdc.td" -include "spec/stdc_ext.td" -include "spec/bsd_ext.td" -include "spec/llvm_libc_stdfix_ext.td" - - -def CTypeAPI : PublicAPI<"ctype.h"> { -} - -def FEnvAPI : PublicAPI<"fenv.h"> { - let Types = ["fenv_t", "fexcept_t"]; -} - -def IntTypesAPI : PublicAPI<"inttypes.h"> { - let Types = ["imaxdiv_t"]; -} - -def MathAPI : PublicAPI<"math.h"> { - let Types = ["double_t", "float_t"]; -} - -def StdIOAPI : PublicAPI<"stdio.h"> { - let Types = ["size_t"]; -} - -def StdlibAPI : PublicAPI<"stdlib.h"> { - let Types = [ - "div_t", - "ldiv_t", - "lldiv_t", - "size_t", - "__bsearchcompare_t", - "__qsortcompare_t", - ]; -} - -def StringAPI : PublicAPI<"string.h"> { - let Types = ["size_t"]; -} - -def TimeAPI : PublicAPI<"time.h"> { - let Types = [ - "clock_t", - "time_t", - "struct tm", - "struct timespec", - ]; -} - -def UCharAPI : PublicAPI<"uchar.h"> { - let Types = [ - "mbstate_t", - "char8_t", - "char16_t", - "char32_t", - ]; -} diff --git a/libc/config/gpu/api.td b/libc/config/gpu/api.td deleted file mode 100644 index 995ff31c4ac9e..0000000000000 --- a/libc/config/gpu/api.td +++ /dev/null @@ -1,50 +0,0 @@ -include "config/public_api.td" - -include "spec/stdc.td" -include "spec/posix.td" -include "spec/gpu_ext.td" -include "spec/gnu_ext.td" -include "spec/stdc_ext.td" -include "spec/llvm_libc_ext.td" - - -def StringAPI : PublicAPI<"string.h"> { - let Types = ["size_t"]; -} - -def StdlibAPI : PublicAPI<"stdlib.h"> { - let Types = [ - "div_t", - "ldiv_t", - "lldiv_t", - "size_t", - "__bsearchcompare_t", - "__qsortcompare_t", - "__qsortrcompare_t", - "__atexithandler_t", - ]; -} - -def FenvAPI: PublicAPI<"fenv.h"> { - let Types = ["fenv_t"]; -} - -def StdIOAPI : PublicAPI<"stdio.h"> { - let Types = [ - "FILE", - "off_t", - "size_t", - ]; -} - -def IntTypesAPI : PublicAPI<"inttypes.h"> { - let Types = ["imaxdiv_t"]; -} - -def TimeAPI : PublicAPI<"time.h"> { - let Types = [ - "clock_t", - "time_t", - "struct timespec", - ]; -} diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td deleted file mode 100644 index a00e0f61b90df..0000000000000 --- a/libc/config/linux/api.td +++ /dev/null @@ -1,276 +0,0 @@ -include "config/public_api.td" - -include "spec/stdc.td" -include "spec/posix.td" -include "spec/linux.td" -include "spec/gnu_ext.td" -include "spec/bsd_ext.td" -include "spec/stdc_ext.td" -include "spec/llvm_libc_ext.td" -include "spec/llvm_libc_stdfix_ext.td" - -def CTypeAPI : PublicAPI<"ctype.h"> { -} - -def FCntlAPI : PublicAPI<"fcntl.h"> { 
- let Types = [ - "mode_t", - "off_t", - ]; -} - -def IntTypesAPI : PublicAPI<"inttypes.h"> { - let Types = ["imaxdiv_t"]; -} - -def MathAPI : PublicAPI<"math.h"> { - let Types = ["double_t", "float_t", "float128"]; -} - -def FenvAPI: PublicAPI<"fenv.h"> { - let Types = ["fenv_t", "fexcept_t"]; -} - -def StringAPI : PublicAPI<"string.h"> { - let Types = ["size_t"]; -} - -def StdIOAPI : PublicAPI<"stdio.h"> { - let Macros = [ - SimpleMacroDef<"stderr", "stderr">, - SimpleMacroDef<"stdin", "stdin">, - SimpleMacroDef<"stdout", "stdout">, - ]; - let Types = [ - "FILE", - "cookie_io_functions_t", - "off_t", - "size_t", - ]; -} - -def StdlibAPI : PublicAPI<"stdlib.h"> { - let Types = [ - "div_t", - "ldiv_t", - "lldiv_t", - "size_t", - "__bsearchcompare_t", - "__qsortcompare_t", - "__qsortrcompare_t", - "__atexithandler_t", - ]; -} - -def TimeAPI : PublicAPI<"time.h"> { - let Types = [ - "clock_t", - "time_t", - "struct tm", - "struct timespec", - "struct timeval", - "clockid_t", - ]; -} - -def SchedAPI : PublicAPI<"sched.h"> { - let Types = [ - "pid_t", - "size_t", - "cpu_set_t", - "struct sched_param", - // Needed according to posix standard - "time_t", - "struct timespec", - ]; -} - -def SysMManAPI : PublicAPI<"sys/mman.h"> { - let Types = ["off_t", "size_t", "mode_t"]; -} - -def SignalAPI : PublicAPI<"signal.h"> { - let Types = [ - "sig_atomic_t", - "sigset_t", - "struct sigaction", - "union sigval", - "siginfo_t", - "stack_t", - "pid_t", - ]; -} - -def ThreadsAPI : PublicAPI<"threads.h"> { - let Macros = [ - SimpleMacroDef<"ONCE_FLAG_INIT", "{0}">, - ]; - - let Types = [ - "__call_once_func_t", - "once_flag", - "cnd_t", - "mtx_t", - "thrd_t", - "thrd_start_t", - "tss_t", - "tss_dtor_t", - ]; - - let Enumerations = [ - "mtx_plain", - "mtx_recursive", - "mtx_timed", - "thrd_timedout", - "thrd_success", - "thrd_busy", - "thrd_error", - "thrd_nomem", - ]; -} - -def PThreadAPI : PublicAPI<"pthread.h"> { - let Types = [ - "__atfork_callback_t", - "__pthread_once_func_t", - "__pthread_start_t", - "__pthread_tss_dtor_t", - "pthread_attr_t", - "pthread_condattr_t", - "pthread_key_t", - "pthread_mutex_t", - "pthread_mutexattr_t", - "pthread_once_t", - "pthread_rwlockattr_t", - "pthread_rwlock_t", - "pthread_spinlock_t", - "pthread_t", - ]; -} - -def DirentAPI : PublicAPI<"dirent.h"> { - let Types = [ - "ino_t", - "DIR", - "struct dirent", - ]; -} - -def UniStdAPI : PublicAPI<"unistd.h"> { - let Types = ["__exec_argv_t", "__exec_envp_t", "off_t", "pid_t", "size_t", - "ssize_t", "uid_t", "__getoptargv_t"]; -} - -def WCharAPI : PublicAPI<"wchar.h"> { - let Types = [ - "wchar_t", - "wint_t", - "size_t", - ]; -} - -def UCharAPI : PublicAPI<"uchar.h"> { - let Types = [ - "mbstate_t", - "char8_t", - "char16_t", - "char32_t", - ]; -} - -def SysRandomAPI : PublicAPI<"sys/random.h"> { - let Types = ["size_t", "ssize_t"]; -} - -def SysSelectAPI : PublicAPI<"sys/select.h"> { - let Types = ["fd_set", "sigset_t", "suseconds_t", "time_t", "struct timespec", - "struct timeval"]; -} - -def SysSocketAPI : PublicAPI<"sys/socket.h"> { - let Types = [ - "sa_family_t", - "socklen_t", - "struct sockaddr", - "struct sockaddr_un", - "struct msghdr", - "struct iovec", - "size_t", - "ssize_t", - ]; -} - -def SysResourceAPI : PublicAPI<"sys/resource.h"> { - let Types = ["rlim_t", "struct rlimit"]; -} - -def SysStatAPI : PublicAPI<"sys/stat.h"> { - let Types = ["mode_t", "dev_t", "ino_t", "nlink_t", "uid_t", "gid_t", "off_t", - "struct timespec", "struct timeval", "blksize_t", "blkcnt_t", - "struct stat"]; -} - -def SysWaitAPI : 
PublicAPI<"sys/wait.h"> { - let Types = ["pid_t", "struct rusage", "siginfo_t"]; -} - -def SysSendfileAPI : PublicAPI<"sys/sendfile.h"> { - let Types = ["off_t", "size_t", "ssize_t"]; -} - -def SysTypesAPI : PublicAPI<"sys/types.h"> { - let Types = [ - "blkcnt_t", - "blksize_t", - "clockid_t", - "dev_t", - "gid_t", - "ino_t", - "mode_t", - "nlink_t", - "off_t", - "pid_t", - "pthread_attr_t", - "pthread_condattr_t", - "pthread_key_t", - "pthread_mutex_t", - "pthread_mutexattr_t", - "pthread_once_t", - "pthread_rwlockattr_t", - "pthread_rwlock_t", - "pthread_t", - "size_t", - "ssize_t", - "suseconds_t", - "time_t", - "uid_t" - ]; -} - -def SysUtsNameAPI : PublicAPI<"sys/utsname.h"> { - let Types = ["struct utsname"]; -} - -def SysEpollAPI : PublicAPI<"sys/epoll.h"> { - let Types = ["struct epoll_event", "struct epoll_data", "sigset_t", "struct timespec"]; -} - -def SpawnAPI : PublicAPI<"spawn.h"> { - let Types = ["mode_t", "pid_t", "posix_spawnattr_t", "posix_spawn_file_actions_t"]; -} - -def TermiosAPI : PublicAPI<"termios.h"> { - let Types = ["cc_t", "pid_t", "speed_t", "struct termios", "tcflag_t"]; -} - -def SetJmpAPI : PublicAPI<"setjmp.h"> { - let Types = ["jmp_buf"]; -} - -def SearchAPI : PublicAPI<"search.h"> { - let Types = ["ACTION", "ENTRY", "struct hsearch_data", "__lsearchcompare_t"]; -} - -def SysStatvfsAPI : PublicAPI<"sys/statvfs.h"> { - let Types = ["struct statvfs"]; -} diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index af7f429561fe0..5e9cc71279ab1 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -700,6 +700,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.scalbnf16 libc.src.math.setpayloadf16 libc.src.math.setpayloadsigf16 + libc.src.math.sinf16 libc.src.math.sinhf16 libc.src.math.sinpif16 libc.src.math.sqrtf16 diff --git a/libc/config/public_api.td b/libc/config/public_api.td deleted file mode 100644 index 1b34506c643c3..0000000000000 --- a/libc/config/public_api.td +++ /dev/null @@ -1,26 +0,0 @@ -include "spec/spec.td" - -class MacroDef { - string Name = name; - string Defn = ""; -} - -class SimpleMacroDef : MacroDef { - let Defn = !strconcat("#define ", name, " ", value); -} - -class MacroDefineIfNot : MacroDef { - let Defn = !strconcat("#ifndef ", name, "\n", - "#define " , name, " ", value, "\n", - "#endif // ", name); -} - -class PublicAPI { - string HeaderName = name; - list Macros = []; - list Types = []; - list Enumerations = []; - list Structs = []; - list Functions = []; - list Objects = []; -} diff --git a/libc/docs/dev/header_generation.rst b/libc/docs/dev/header_generation.rst index ec4206217ca77..0730b9a40c26a 100644 --- a/libc/docs/dev/header_generation.rst +++ b/libc/docs/dev/header_generation.rst @@ -38,9 +38,6 @@ Required Versions: ``build/projects/libc/include/sys``. -New Headergen is turned on by default, but if you want to use old Headergen, -you can include this statement when building: ``-DLIBC_USE_NEW_HEADER_GEN=OFF`` - To add a function to the YAML files, you can either manually enter it in the YAML file corresponding to the header it belongs to or add it through the command line. diff --git a/libc/docs/gpu/building.rst b/libc/docs/gpu/building.rst index 37dccdab6dc34..88643575ae4d9 100644 --- a/libc/docs/gpu/building.rst +++ b/libc/docs/gpu/building.rst @@ -63,10 +63,6 @@ targeting the default host environment as well. Runtimes cross build -------------------- -.. note:: - These instructions need to be updated for new headergen. 
They may be - inaccurate. - For users wanting more direct control over the build process, the build steps can be done manually instead. This build closely follows the instructions in the :ref:`main documentation` but is specialized for the GPU @@ -82,20 +78,17 @@ compiler. These tools must all be up-to-date with the libc source. $> HOST_CXX_COMPILER= # For example "clang++" $> cmake ../llvm \ -G Ninja \ - -DLLVM_ENABLE_PROJECTS="clang;libc" \ + -DLLVM_ENABLE_PROJECTS="clang" \ -DCMAKE_C_COMPILER=$HOST_C_COMPILER \ -DCMAKE_CXX_COMPILER=$HOST_CXX_COMPILER \ -DLLVM_LIBC_FULL_BUILD=ON \ - -DLIBC_HDRGEN_ONLY=ON \ # Only build the 'libc-hdrgen' tool -DCMAKE_BUILD_TYPE=Release # Release suggested to make "clang" fast $> ninja # Build the 'clang' compiler - $> ninja libc-hdrgen # Build the 'libc-hdrgen' tool -Once this has finished the build directory should contain the ``clang`` compiler -and the ``libc-hdrgen`` executable. We will use the ``clang`` compiler to build -the GPU code and the ``libc-hdrgen`` tool to create the necessary headers. We -use these tools to bootstrap the build out of the runtimes directory targeting a -GPU architecture. +Once this has finished, the build directory should contain the ``clang`` +compiler executable. We will use the ``clang`` compiler to build the GPU code. +We use this compiler to bootstrap the build out of the runtimes directory +targeting a GPU architecture. .. code-block:: sh @@ -105,7 +98,6 @@ GPU architecture. $> TARGET_TRIPLE= $> TARGET_C_COMPILER= $> TARGET_CXX_COMPILER= - $> HDRGEN= $> cmake ../runtimes \ # Point to the runtimes build -G Ninja \ -DLLVM_ENABLE_RUNTIMES=libc \ @@ -113,7 +105,6 @@ GPU architecture. -DCMAKE_CXX_COMPILER=$TARGET_CXX_COMPILER \ -DLLVM_LIBC_FULL_BUILD=ON \ -DLLVM_RUNTIMES_TARGET=$TARGET_TRIPLE \ - -DLIBC_HDRGEN_EXE=$HDRGEN \ -DCMAKE_BUILD_TYPE=Release $> ninja install diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 2b86f49a3619e..4934e93ccb164 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -336,7 +336,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | rsqrt | | | | | | 7.12.7.9 | F.10.4.9 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| sin | |check| | |check| | | | | 7.12.4.6 | F.10.1.6 | +| sin | |check| | |check| | | |check| | | 7.12.4.6 | F.10.1.6 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sincos | |check| | |check| | | | | | | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 899a93ad72d4c..7fc67141996ec 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -17,25 +17,17 @@ add_header( __llvm-libc-common.h ) +# TODO: Can we simplify this macro expansion?
+# https://github.com/llvm/llvm-project/issues/117254 macro(add_header_macro TARGET_NAME YAML_FILE DEF_FILE GEN_HDR DEPENDS) - if (LIBC_USE_NEW_HEADER_GEN) - add_gen_header2( - ${TARGET_NAME} - YAML_FILE ${YAML_FILE} - DEF_FILE ${DEF_FILE} - GEN_HDR ${GEN_HDR} - ${DEPENDS} - ${ARGN} - ) - else() - add_gen_header( - ${TARGET_NAME} - DEF_FILE ${DEF_FILE} - GEN_HDR ${GEN_HDR} - ${DEPENDS} - ${ARGN} - ) - endif() + add_gen_header( + ${TARGET_NAME} + YAML_FILE ${YAML_FILE} + DEF_FILE ${DEF_FILE} + GEN_HDR ${GEN_HDR} + ${DEPENDS} + ${ARGN} + ) endmacro() add_header_macro( diff --git a/libc/newhdrgen/yaml/math.yaml b/libc/newhdrgen/yaml/math.yaml index e09f0929e45f8..00efc34789667 100644 --- a/libc/newhdrgen/yaml/math.yaml +++ b/libc/newhdrgen/yaml/math.yaml @@ -2339,6 +2339,13 @@ functions: return_type: float arguments: - type: float + - name: sinf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: sinhf standards: - stdc diff --git a/libc/spec/bsd_ext.td b/libc/spec/bsd_ext.td deleted file mode 100644 index 2b91324e36db9..0000000000000 --- a/libc/spec/bsd_ext.td +++ /dev/null @@ -1,87 +0,0 @@ -def BsdExtensions : StandardSpec<"BSDExtensions"> { - HeaderSpec Math = HeaderSpec< - "math.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec<"isnan", RetValSpec, [ArgSpec]>, - FunctionSpec<"isnanf", RetValSpec, [ArgSpec]>, - FunctionSpec<"isnanl", RetValSpec, [ArgSpec]>, - ] - >; - - HeaderSpec String = HeaderSpec< - "string.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "strlcat", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "strlcpy", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "strsep", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec Strings = HeaderSpec< - "strings.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "strcasecmp", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strncasecmp", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "index", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "rindex", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec SysWait = HeaderSpec< - "sys/wait.h", - [], // Macros - [StructRUsage], // Types - [], // Enumerations - [ - FunctionSpec< - "wait4", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - > - ] - >; - - let Headers = [ - Math, - String, - Strings, - SysWait, - ]; -} diff --git a/libc/spec/gnu_ext.td b/libc/spec/gnu_ext.td deleted file mode 100644 index 64121aed9574f..0000000000000 --- a/libc/spec/gnu_ext.td +++ /dev/null @@ -1,316 +0,0 @@ -def CpuSetT : NamedType<"cpu_set_t">; -def CpuSetPtr : PtrType; -def ConstCpuSetPtr : ConstType; - -def QSortRCompareT : NamedType<"__qsortrcompare_t">; -def StructHsearchData : NamedType<"struct hsearch_data">; -def StructHsearchDataPtr : PtrType; - -def GnuExtensions : StandardSpec<"GNUExtensions"> { - NamedType CookieIOFunctionsT = NamedType<"cookie_io_functions_t">; - HeaderSpec CType = HeaderSpec< - "ctype.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "toascii", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec Malloc = HeaderSpec< - "malloc.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec<"mallopt", RetValSpec, [ArgSpec, ArgSpec]>, - ] - >; - - HeaderSpec Math = HeaderSpec< - "math.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "sincosf", - 
RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "lgamma_r", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "lgammaf_r", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "lgammal_r", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec Sched = HeaderSpec< - "sched.h", - [], // Macros - [PidT, SizeTType, CpuSetT], // Types - [], // Enumerations - [ - FunctionSpec< - "sched_getaffinity", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "sched_setaffinity", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - ] - >; - HeaderSpec String = HeaderSpec< - "string.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "memmem", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "memrchr", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "strerror_r", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "strcasestr", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strchrnul", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec Search = HeaderSpec< - "search.h", - [], // Macros - [ - StructHsearchData - ], - [], // Enumerations - [ - FunctionSpec< - "hcreate_r", - RetValSpec, - [ - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "hdestroy_r", - RetValSpec, - [ - ArgSpec - ] - >, - FunctionSpec< - "hsearch_r", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec - ] - >, - ] - >; - - HeaderSpec FEnv = HeaderSpec< - "fenv.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "fedisableexcept", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "feenableexcept", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fegetexcept", - RetValSpec, - [] - > - ] - >; - - HeaderSpec StdIO = HeaderSpec< - "stdio.h", - [], // Macros - [CookieIOFunctionsT], // Types - [], // Enumerations - [ - FunctionSpec< - "clearerr_unlocked", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "feof_unlocked", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "ferror_unlocked", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fopencookie", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "fread_unlocked", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "fwrite_unlocked", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "fgetc_unlocked", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec StdLib = HeaderSpec< - "stdlib.h", - [], // Macros - [QSortRCompareT], // Types - [], // Enumerations - [ - FunctionSpec< - "qsort_r", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec PThread = HeaderSpec< - "pthread.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "pthread_setname_np", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_getname_np", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec SysAuxv = HeaderSpec< - "sys/auxv.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "getauxval", - RetValSpec, - [ArgSpec] - >, - ] // Functions - >; - - HeaderSpec SendFile = HeaderSpec< - "sys/sendfile.h", - [], // Macros - [OffTType, SizeTType, SSizeTType,], // Types - [], // Enumerations - [ - FunctionSpec< - "sendfile", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec UniStd = HeaderSpec< - "unistd.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "dup2", - RetValSpec, - 
[ArgSpec, ArgSpec, ArgSpec] - >, - ] - >; - - let Headers = [ - CType, - FEnv, - Malloc, - Math, - PThread, - Sched, - SendFile, - SysAuxv, - StdIO, - StdLib, - String, - Search, - UniStd, - ]; -} diff --git a/libc/spec/gpu_ext.td b/libc/spec/gpu_ext.td deleted file mode 100644 index d99531dc06bcd..0000000000000 --- a/libc/spec/gpu_ext.td +++ /dev/null @@ -1,18 +0,0 @@ -def GPUExtensions : StandardSpec<"GPUExtensions"> { - HeaderSpec RPC = HeaderSpec< - "gpu/rpc.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "rpc_host_call", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - ] - >; - let Headers = [ - RPC, - ]; -} diff --git a/libc/spec/linux.td b/libc/spec/linux.td deleted file mode 100644 index 99e0949a592df..0000000000000 --- a/libc/spec/linux.td +++ /dev/null @@ -1,334 +0,0 @@ -def StructEpollEvent : NamedType<"struct epoll_event">; -def StructEpollEventPtr : PtrType; - -def StructEpollData : NamedType<"struct epoll_data">; - -def Linux : StandardSpec<"Linux"> { - HeaderSpec Errno = HeaderSpec< - "errno.h", - [ - Macro<"ENOMEDIUM">, - Macro<"ENOTBLK">, - Macro<"EMEDIUMTYPE">, - Macro<"EBADSLT">, - Macro<"ECHRNG">, - Macro<"ERFKILL">, - Macro<"EUSERS">, - Macro<"EBADR">, - Macro<"EL3HLT">, - Macro<"ENOTUNIQ">, - Macro<"EXFULL">, - Macro<"EHOSTDOWN">, - Macro<"EL3RST">, - Macro<"ENOPKG">, - Macro<"ENOCSI">, - Macro<"EUNATCH">, - Macro<"EREMCHG">, - Macro<"ETOOMANYREFS">, - Macro<"EL2HLT">, - Macro<"EBADFD">, - Macro<"EREMOTEIO">, - Macro<"ENAVAIL">, - Macro<"ELIBEXEC">, - Macro<"ESHUTDOWN">, - Macro<"ENOKEY">, - Macro<"ESTRPIPE">, - Macro<"EKEYREJECTED">, - Macro<"ESRMNT">, - Macro<"EKEYREVOKED">, - Macro<"EBADE">, - Macro<"ELIBBAD">, - Macro<"EISNAM">, - Macro<"EBFONT">, - Macro<"EPFNOSUPPORT">, - Macro<"EREMOTE">, - Macro<"EDEADLOCK">, - Macro<"ENONET">, - Macro<"EDOTDOT">, - Macro<"EKEYEXPIRED">, - Macro<"ELIBSCN">, - Macro<"ERESTART">, - Macro<"EBADRQC">, - Macro<"EUCLEAN">, - Macro<"ENOANO">, - Macro<"ELIBACC">, - Macro<"EHWPOISON">, - Macro<"ELIBMAX">, - Macro<"ESOCKTNOSUPPORT">, - Macro<"ENOTNAM">, - Macro<"ELNRNG">, - Macro<"EL2NSYNC">, - Macro<"EADV">, - Macro<"ECOMM">, - ] - >; - - HeaderSpec Sched = HeaderSpec< - "sched.h", - [ - Macro<"SCHED_OTHER">, - Macro<"SCHED_FIFO">, - Macro<"SCHED_RR">, - Macro<"SCHED_BATCH">, - Macro<"SCHED_ISO">, - Macro<"SCHED_IDLE">, - Macro<"SCHED_DEADLINE">, - ], - [], // Types - [], // Enumerations - [] // Functions - >; - - HeaderSpec SysMMan = HeaderSpec< - "sys/mman.h", - [Macro<"MAP_ANONYMOUS">], - [], // Types - [], // Enumerations - [ - FunctionSpec< - "mincore", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "mlock2", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "remap_file_pages", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec, - FunctionSpec< - "process_mrelease", - RetValSpec, - [ - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "mremap", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec, - ] - >, - ] // Functions - >; - - - HeaderSpec SysPrctl = HeaderSpec< - "sys/prctl.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "prctl", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec, - ] - >, - ] // Functions - >; - - HeaderSpec SysRandom = HeaderSpec< - "sys/random.h", - [ - Macro<"GRND_RANDOM">, - Macro<"GRND_NONBLOCK">, - Macro<"GRND_INSECURE">, - ], - [SizeTType, SSizeTType], // Types - [], // Enumerations - [ - FunctionSpec< - "getrandom", - 
RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec - ] - >, - ] - >; - - HeaderSpec SysTime = HeaderSpec< - "sys/time.h", - [ - Macro<"timeradd">, - Macro<"timersub">, - Macro<"timerclear">, - Macro<"timerisset">, - Macro<"timercmp">, - ], - [StructTimevalType], // Types - [], // Enumerations - [] // Functions - >; - - - HeaderSpec SysEpoll = HeaderSpec< - "sys/epoll.h", - [], // Macros - [ - StructEpollEvent, - StructEpollData, - SigSetType, - StructTimeSpec, - ], // Types - [], // Enumerations - [ - FunctionSpec< - "epoll_create", - RetValSpec, - [ - ArgSpec - ] - >, - FunctionSpec< - "epoll_create1", - RetValSpec, - [ - ArgSpec - ] - >, - FunctionSpec< - "epoll_ctl", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "epoll_wait", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "epoll_pwait", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "epoll_pwait2", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec - ] - >, - ] // Functions - >; - - HeaderSpec Signal = HeaderSpec< - "signal.h", - [ - Macro<"NSIG">, - - Macro<"SIGHUP">, - Macro<"SIGINT">, - Macro<"SIGQUIT">, - Macro<"SIGILL">, - Macro<"SIGTRAP">, - Macro<"SIGABRT">, - Macro<"SIGIOT">, - Macro<"SIGBUS">, - Macro<"SIGFPE">, - Macro<"SIGKILL">, - Macro<"SIGUSR1">, - Macro<"SIGSEGV">, - Macro<"SIGUSR2">, - Macro<"SIGPIPE">, - Macro<"SIGALRM">, - Macro<"SIGTERM">, - Macro<"SIGSTKFLT">, - Macro<"SIGCHLD">, - Macro<"SIGCONT">, - Macro<"SIGSTOP">, - Macro<"SIGTSTP">, - Macro<"SIGTTIN">, - Macro<"SIGTTOU">, - Macro<"SIGURG">, - Macro<"SIGXCPU">, - Macro<"SIGXFSZ">, - Macro<"SIGVTALRM">, - Macro<"SIGPROF">, - Macro<"SIGWINCH">, - Macro<"SIGIO">, - Macro<"SIGPOLL">, - Macro<"SIGPWR">, - Macro<"SIGSYS">, - Macro<"SIGUNUSED">, - ] - >; - - - HeaderSpec UniStd = HeaderSpec< - "unistd.h", - [], // Macros - [], - [], // Enumerations - [ - FunctionSpec< - "pipe2", - RetValSpec, - [ArgSpec, ArgSpec] //TODO: make this int[2] - >, - ], - [] - >; - - - let Headers = [ - Errno, - SysEpoll, - SysMMan, - SysPrctl, - SysRandom, - SysTime, - Signal, - UniStd, - ]; -} diff --git a/libc/spec/llvm_libc_ext.td b/libc/spec/llvm_libc_ext.td deleted file mode 100644 index cd63e34a44ef0..0000000000000 --- a/libc/spec/llvm_libc_ext.td +++ /dev/null @@ -1,116 +0,0 @@ -def LLVMLibcExt : StandardSpec<"llvm_libc_ext"> { - HeaderSpec Strings = HeaderSpec< - "strings.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "bcopy", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "bzero", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "bcmp", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec Assert = HeaderSpec< - "assert.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "__assert_fail", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec Sched = HeaderSpec< - "sched.h", - [], // Macros - [PidT, SizeTType, CpuSetT], // Types - [], // Enumerations - [ - FunctionSpec< - "__sched_getcpucount", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec Math = HeaderSpec< - "math.h", - [], // Macros - [], // Types - [], // Enumerations - [ - GuardedFunctionSpec<"daddf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - GuardedFunctionSpec<"ddivf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - GuardedFunctionSpec<"dfmaf128", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], 
"LIBC_TYPES_HAS_FLOAT128">, - GuardedFunctionSpec<"dsqrtf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - GuardedFunctionSpec<"dsubf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - - GuardedFunctionSpec<"f16add", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16addf", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16addl", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - GuardedFunctionSpec<"f16sub", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16subf", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16subl", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - GuardedFunctionSpec<"faddf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - GuardedFunctionSpec<"fdivf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - GuardedFunctionSpec<"ffmaf128", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - GuardedFunctionSpec<"fmulf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - GuardedFunctionSpec<"dmulf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - GuardedFunctionSpec<"f16mul", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16mulf", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16mull", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - GuardedFunctionSpec<"f16div", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16divf", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16divl", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - GuardedFunctionSpec<"f16fma", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16fmaf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16fmal", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - GuardedFunctionSpec<"f16sqrt", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16sqrtf", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16sqrtl", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - GuardedFunctionSpec<"fsqrtf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - GuardedFunctionSpec<"fsubf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"powi", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"powif", RetValSpec, [ArgSpec, ArgSpec]>, - ] - >; - - let Headers = [ - Assert, - Math, - Sched, - Strings, - ]; -} diff --git a/libc/spec/llvm_libc_stdfix_ext.td b/libc/spec/llvm_libc_stdfix_ext.td deleted file mode 100644 index 7bc7ec5464081..0000000000000 --- a/libc/spec/llvm_libc_stdfix_ext.td +++ /dev/null @@ -1,27 +0,0 @@ -def LLVMLibcStdfixExt : StandardSpec<"llvm_libc_stdfix_ext"> { - HeaderSpec StdFix = HeaderSpec< - "stdfix.h", - [], // macros - [], // types - [], // enums - [ // functions - GuardedFunctionSpec<"exphk", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"expk", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - - GuardedFunctionSpec<"sqrtuhr", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"sqrtur", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"sqrtulr", RetValSpec, [ArgSpec], 
"LIBC_COMPILER_HAS_FIXED_POINT">, - - GuardedFunctionSpec<"sqrtuhk", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"sqrtuk", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"sqrtulk", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - - GuardedFunctionSpec<"uhksqrtus", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"uksqrtui", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - ] - >; - - let Headers = [ - StdFix, - ]; -} diff --git a/libc/spec/posix.td b/libc/spec/posix.td deleted file mode 100644 index e354deef340f1..0000000000000 --- a/libc/spec/posix.td +++ /dev/null @@ -1,1867 +0,0 @@ -def SigSetType : NamedType<"sigset_t">; -def SigSetPtrType : PtrType; -def ConstSigSetPtrType : ConstType; -def RestrictedSigSetType : RestrictedPtrType; -def ConstRestrictedSigSetType : ConstType; - -def SigInfoType : NamedType<"siginfo_t">; -def UnionSigVal : NamedType<"union sigval">; - -def StructSigaction : NamedType<"struct sigaction">; -def StructSigactionPtr : PtrType; -def ConstStructSigactionPtr : ConstType; -def RestrictedStructSigactionPtr : RestrictedPtrType; -def ConstRestrictedStructSigactionPtr : ConstType; - -def PThreadStartT : NamedType<"__pthread_start_t">; -def PThreadTSSDtorT : NamedType<"__pthread_tss_dtor_t">; -def PThreadKeyT : NamedType<"pthread_key_t">; -def PThreadKeyTPtr : PtrType; -def PThreadOnceT : NamedType<"pthread_once_t">; -def PThreadOnceTPtr : PtrType; -def PThreadOnceCallback : NamedType<"__pthread_once_func_t">; - -def InoT : NamedType<"ino_t">; -def UidT : NamedType<"uid_t">; -def GidT : NamedType<"gid_t">; -def DevT : NamedType<"dev_t">; -def ClockIdT : NamedType<"clockid_t">; -def RestrictedClockIdTPtr : RestrictedPtrType; -def BlkSizeT : NamedType<"blksize_t">; -def BlkCntT : NamedType<"blkcnt_t">; -def NLinkT : NamedType<"nlink_t">; - -def StatType : NamedType<"struct stat">; -def StatTypePtr : PtrType; -def RestrictedStatTypePtr : RestrictedPtrType; - -def DIR : NamedType<"DIR">; -def DIRPtr : PtrType; -def DIRRestrictedPtr : RestrictedPtrType; -def StructDirent : NamedType<"struct dirent">; -def StructDirentPtr : PtrType; -def StructDirentPtrPtr : PtrType; -def ConstStructDirentPtrPtr : ConstType; - -def StructSchedParam : NamedType<"struct sched_param">; -def StructSchedParamPtr : PtrType; -def ConstStructSchedParamPtr : ConstType; - -def ExecArgvT : NamedType<"__exec_argv_t">; -def ExecEnvpT : NamedType<"__exec_envp_t">; - -def AtForkCallbackT : NamedType<"__atfork_callback_t">; - -def PosixSpawnFileActionsT : NamedType<"posix_spawn_file_actions_t">; -def PosixSpawnFileActionsTPtr : PtrType; -def ConstPosixSpawnFileActionsTPtr : ConstType; -def PosixSpawnFileActionsTRestrictedPtr : RestrictedPtrType; - -def PosixSpawnAttrT : NamedType<"posix_spawnattr_t">; -def RestrictedPosixSpawnAttrTPtrType : RestrictedPtrType; - -def CcT : NamedType<"cc_t">; -def SpeedT : NamedType<"speed_t">; -def StructTermios : NamedType<"struct termios">; -def StructTermiosPtr : PtrType; -def ConstStructTermiosPtr : ConstType; -def TcFlagT : NamedType<"tcflag_t">; - -def StackT : NamedType<"stack_t">; -def StackTPtr : PtrType; -def RestrictedStackTPtr : RestrictedPtrType; -def ConstRestrictedStackTPtr : ConstType; - -def FdSet : NamedType<"fd_set">; -def FdSetPtr : PtrType; -def RestrictedFdSetPtr : RestrictedPtrType; - -def GetoptArgvT : NamedType<"__getoptargv_t">; - -def SAFamilyType : NamedType<"sa_family_t">; -def SocklenType : NamedType<"socklen_t">; -def SocklenPtr 
: PtrType; - -def StructSockAddr : NamedType<"struct sockaddr">; -def StructSockAddrPtr : PtrType; -def ConstStructSockAddrPtr : ConstType; - -def StructMsghdr : NamedType<"struct msghdr">; -def StructMsghdrPtr : PtrType; -def ConstStructMsghdrPtr : ConstType; - -def StructIovec : NamedType<"struct iovec">; -def StructIovecPtr : PtrType; -def ConstStructIovecPtr : ConstType; - -def StructSockAddrUn : NamedType<"struct sockaddr_un">; - -def StructStatvfs : NamedType<"struct statvfs">; -def StructStatvfsPtr : PtrType; -def RestrictedStructStatvfsPtr : RestrictedPtrType; - -// The function pointer type for the predicate for lsearch, lfind -def LSearchCompareT : NamedType<"__lsearchcompare_t">; - -def POSIX : StandardSpec<"POSIX"> { - PtrType CharPtr = PtrType; - RestrictedPtrType RestrictedCharPtr = RestrictedPtrType; - RestrictedPtrType CharRestrictedDoublePtr = RestrictedPtrType; - ConstType ConstCharPtr = ConstType; - ConstType ConstRestrictedCharPtr = ConstType; - - NamedType ModeTType = NamedType<"mode_t">; - - NamedType PThreadAttrTType = NamedType<"pthread_attr_t">; - PtrType PThreadAttrTPtr = PtrType; - RestrictedPtrType RestrictedPThreadAttrTPtr = RestrictedPtrType; - ConstType ConstPThreadAttrTPtr = ConstType; - ConstType ConstRestrictedPThreadAttrTPtr = ConstType; - - NamedType PThreadCondAttrTType = NamedType<"pthread_condattr_t">; - PtrType PThreadCondAttrTPtr = PtrType; - ConstType ConstRestrictedPThreadCondAttrTPtr = ConstType>; - - NamedType PThreadRWLockAttrTType = NamedType<"pthread_rwlockattr_t">; - PtrType PThreadRWLockAttrTPtr = PtrType; - ConstType ConstPThreadRWLockAttrTPtr = ConstType; - RestrictedPtrType RestrictedPThreadRWLockAttrTPtr = RestrictedPtrType; - ConstType ConstRestrictedPThreadRWLockAttrTPtr = ConstType; - - NamedType PThreadMutexAttrTType = NamedType<"pthread_mutexattr_t">; - PtrType PThreadMutexAttrTPtr = PtrType; - RestrictedPtrType RestrictedPThreadMutexAttrTPtr = RestrictedPtrType; - ConstType ConstPThreadMutexAttrTPtr = ConstType; - ConstType ConstRestrictedPThreadMutexAttrTPtr = ConstType; - - NamedType PThreadMutexTType = NamedType<"pthread_mutex_t">; - PtrType PThreadMutexTPtr = PtrType; - RestrictedPtrType RestrictedPThreadMutexTPtr = RestrictedPtrType; - ConstType ConstPThreadMutexTPtr = ConstType; - ConstType ConstRestrictedPThreadMutexTPtr = ConstType; - - NamedType PThreadRWLockTType = NamedType<"pthread_rwlock_t">; - PtrType PThreadRWLockTPtr = PtrType; - RestrictedPtrType RestrictedPThreadRWLockTPtr = RestrictedPtrType; - - NamedType PThreadSpinLockTType = NamedType<"pthread_spinlock_t">; - PtrType PThreadSpinLockTPtr = PtrType; - - PtrType PThreadTPtr = PtrType; - RestrictedPtrType RestrictedPThreadTPtr = RestrictedPtrType; - - HeaderSpec Errno = HeaderSpec< - "errno.h", - [ - Macro<"E2BIG">, - Macro<"EACCES">, - Macro<"EADDRINUSE">, - Macro<"EADDRNOTAVAIL">, - Macro<"EAFNOSUPPORT">, - Macro<"EAGAIN">, - Macro<"EALREADY">, - Macro<"EBADF">, - Macro<"EBADMSG">, - Macro<"EBUSY">, - Macro<"ECANCELED">, - Macro<"ECHILD">, - Macro<"ECONNABORTED">, - Macro<"ECONNREFUSED">, - Macro<"ECONNRESET">, - Macro<"EDEADLK">, - Macro<"EDESTADDRREQ">, - Macro<"EDQUOT">, - Macro<"EEXIST">, - Macro<"EFAULT">, - Macro<"EFBIG">, - Macro<"EHOSTUNREACH">, - Macro<"EIDRM">, - Macro<"EINPROGRESS">, - Macro<"EINTR">, - Macro<"EINVAL">, - Macro<"EIO">, - Macro<"EISCONN">, - Macro<"EISDIR">, - Macro<"ELOOP">, - Macro<"EMFILE">, - Macro<"EMLINK">, - Macro<"EMSGSIZE">, - Macro<"EMULTIHOP">, - Macro<"ENAMETOOLONG">, - Macro<"ENETDOWN">, - Macro<"ENETRESET">, - 
Macro<"ENETUNREACH">, - Macro<"ENFILE">, - Macro<"ENOBUFS">, - Macro<"ENODATA">, - Macro<"ENODEV">, - Macro<"ENOENT">, - Macro<"ENOEXEC">, - Macro<"ENOLCK">, - Macro<"ENOLINK">, - Macro<"ENOMEM">, - Macro<"ENOMSG">, - Macro<"ENOPROTOOPT">, - Macro<"ENOSPC">, - Macro<"ENOSR">, - Macro<"ENOSTR">, - Macro<"ENOSYS">, - Macro<"ENOTCONN">, - Macro<"ENOTDIR">, - Macro<"ENOTEMPTY">, - Macro<"ENOTRECOVERABLE">, - Macro<"ENOTSOCK">, - Macro<"ENOTSUP">, - Macro<"ENOTTY">, - Macro<"ENXIO">, - Macro<"EOPNOTSUPP">, - Macro<"EOVERFLOW">, - Macro<"EOWNERDEAD">, - Macro<"EPERM">, - Macro<"EPIPE">, - Macro<"EPROTO">, - Macro<"EPROTONOSUPPORT">, - Macro<"EPROTOTYPE">, - Macro<"EROFS">, - Macro<"ESPIPE">, - Macro<"ESRCH">, - Macro<"ESTALE">, - Macro<"ETIME">, - Macro<"ETIMEDOUT">, - Macro<"ETXTBSY">, - Macro<"EWOULDBLOCK">, - Macro<"EXDEV">, - ], - [], // Types - [], // Enumerations - [] // Functions - >; - - HeaderSpec DlFcn = HeaderSpec< - "dlfcn.h", - [ - Macro<"RTLD_LAZY">, - Macro<"RTLD_NOW">, - Macro<"RTLD_GLOBAL">, - Macro<"RTLD_LOCAL">, - ], - [], // Types - [], // Enumerations - [ - FunctionSpec< - "dlclose", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "dlerror", - RetValSpec, - [] - >, - FunctionSpec< - "dlopen", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "dlsym", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec FCntl = HeaderSpec< - "fcntl.h", - [], // Macros - [ - ModeTType, - OffTType, - ], - [], // Enumerations - [ - FunctionSpec< - "creat", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "fcntl", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "open", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "openat", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec SysMMan = HeaderSpec< - "sys/mman.h", - [ - // TODO: Add a facility to bunch macros into bitwise-or-able groups. - // POSIX requires it, so such thing should be captured in this spec. 
- Macro<"PROT_EXEC">, - Macro<"PROT_NONE">, - Macro<"PROT_READ">, - Macro<"PROT_WRITE">, - - Macro<"MAP_FIXED">, - Macro<"MAP_PRIVATE">, - Macro<"MAP_SHARED">, - - Macro<"MAP_FAILED">, - ], - [ - SizeTType, - OffTType, - ModeTType, - ], - [], // Enumerations - [ - FunctionSpec< - "madvise", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "mmap", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "mprotect", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "munmap", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "posix_madvise", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "mlock", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "munlock", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "mlockall", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "munlockall", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "msync", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "shm_open", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "shm_unlink", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec Signal = HeaderSpec< - "signal.h", - [], // Macros - [ - SigInfoType, - SigSetType, - StackT, - StructSigaction, - UnionSigVal, - PidT, - ], - [], // Enumerations - [ - FunctionSpec< - "kill", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "sigaction", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "sigaltstack", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "sigdelset", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "sigprocmask", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "sigemptyset", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "sigaddset", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "sigfillset", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec UniStd = HeaderSpec< - "unistd.h", - [], // Macros - [ - ExecArgvT, - ExecEnvpT, - OffTType, - SSizeTType, - SizeTType, - PidT, - UidT, - GetoptArgvT, - ], - [], // Enumerations - [ - FunctionSpec< - "_exit", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "access", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "chdir", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "dup", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "dup2", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "dup3", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "fchdir", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "getcwd", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "close", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "execv", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "execve", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "fork", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fsync", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "ftruncate", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "geteuid", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "getpid", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "getppid", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "gettid", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "getuid", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "isatty", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "link", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "linkat", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec, 
ArgSpec] - >, - FunctionSpec< - "lseek", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "pread", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "pwrite", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "read", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "readlink", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "readlinkat", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "rmdir", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "getpid", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "getppid", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "link", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "linkat", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "lseek", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "pipe", - RetValSpec, - [ArgSpec] //TODO: make this int[2] - >, - FunctionSpec< - "pread", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "pwrite", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "read", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "readlink", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "readlinkat", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "rmdir", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "swab", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "symlink", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "symlinkat", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "sysconf", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "__llvm_libc_syscall", - RetValSpec, - [ArgSpec,ArgSpec,ArgSpec,ArgSpec,ArgSpec,ArgSpec,ArgSpec] - >, - FunctionSpec< - "truncate", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "unlink", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "unlinkat", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "write", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "getopt", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - ], - [ - ObjectSpec<"environ", "char **">, - ObjectSpec< - "optarg", - "char *" - >, - ObjectSpec< - "optind", - "int" - >, - ObjectSpec< - "opterr", - "int" - >, - ObjectSpec< - "optopt", - "int" - >, - ] - >; - - HeaderSpec StdLib = HeaderSpec< - "stdlib.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "getenv", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec Sched = HeaderSpec< - "sched.h", - [], // Macros - [PidT, TimeTType, StructTimeSpec, StructSchedParam], // Types - [], // Enumerations - [ - FunctionSpec< - "sched_yield", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "sched_setparam", - RetValSpec, - [ArgSpec, ArgSpec] - >, - - FunctionSpec< - "sched_getparam", - RetValSpec, - [ArgSpec, ArgSpec] - >, - - FunctionSpec< - "sched_setscheduler", - RetValSpec, - [ArgSpec] - >, - - FunctionSpec< - "sched_getscheduler", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - - FunctionSpec< - "sched_get_priority_min", - RetValSpec, - [ArgSpec] - >, - - FunctionSpec< - "sched_get_priority_max", - RetValSpec, - [ArgSpec] - >, - - FunctionSpec< - "sched_rr_get_interval", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec String = HeaderSpec< - "string.h", - [ - Macro<"NULL">, - ], - [ - SizeTType, - ], - [], // Enumerations - [ - FunctionSpec< - "memccpy", 
- RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "mempcpy", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "stpcpy", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "stpncpy", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "strnlen", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strtok_r", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "strsignal", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec CType = HeaderSpec< - "ctype.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "isascii", - RetValSpec, - [ArgSpec] - >, - ] - >; - - NamedType RLimTType = NamedType<"rlim_t">; - NamedType StructRLimitType = NamedType<"struct rlimit">; - PtrType StructRLimitPtr = PtrType; - ConstType ConstStructRLimitPtr = ConstType; - HeaderSpec SysResource = HeaderSpec< - "sys/resource.h", - [], // Macros - [RLimTType, StructRLimitType], // Types - [], // Enumerations - [ - FunctionSpec< - "getrlimit", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "setrlimit", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec SysStat = HeaderSpec< - "sys/stat.h", - [], // Macros - [ - ModeTType, - DevT, - InoT, - UidT, - GidT, - StructTimeSpec, - StructTimevalType, - BlkSizeT, - BlkCntT, - OffTType, - NLinkT, - StatType, - ], // Types - [], // Enumerations - [ - FunctionSpec< - "chmod", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "fchmod", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "fchmodat", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "fstat", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "lstat", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "mkdir", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "mkdirat", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "stat", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec SysStatvfs = HeaderSpec< - "sys/statvfs.h", - [], // Macros - [StructStatvfs], // Types - [], // Enumerations - [ - FunctionSpec< - "statvfs", - RetValSpec, - [ - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "fstatvfs", - RetValSpec, - [ - ArgSpec, - ArgSpec - ] - >, - ] // Functions - >; - - NamedType StructUtsName = NamedType<"struct utsname">; - PtrType StructUtsNamePtr = PtrType; - HeaderSpec SysUtsName = HeaderSpec< - "sys/utsname.h", - [], // Macros - [StructUtsName], // Types - [], // Enumerations - [ - FunctionSpec< - "uname", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec ArpaInet = HeaderSpec< - "arpa/inet.h", - [], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec< - "htonl", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "htons", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "ntohl", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "ntohs", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec PThread = HeaderSpec< - "pthread.h", - [], // Macros - [ - AtForkCallbackT, - ClockIdT, - PThreadAttrTType, - PThreadCondAttrTType, - PThreadKeyT, - PThreadMutexAttrTType, - PThreadMutexTType, - PThreadOnceCallback, - PThreadOnceT, - PThreadRWLockAttrTType, - PThreadRWLockTType, - PThreadSpinLockTType, - PThreadStartT, - PThreadTSSDtorT, - PThreadTType, - ], // Types - [], // Enumerations - [ - FunctionSpec< - "pthread_atfork", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_attr_init", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_attr_destroy", - 
RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_attr_getdetachstate", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_attr_setdetachstate", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_attr_getguardsize", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_attr_setguardsize", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_attr_getstacksize", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_attr_setstacksize", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_attr_getstack", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_attr_setstack", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_condattr_destroy", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_condattr_getclock", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_condattr_getpshared", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_condattr_init", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_condattr_setclock", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_condattr_setpshared", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_create", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_join", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_detach", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_exit", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_self", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_equal", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_init", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_destroy", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_gettype", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_settype", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_getrobust", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_setrobust", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_getpshared", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_setpshared", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_getprotocol", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_setprotocol", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_getprioceiling", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutexattr_setprioceiling", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutex_init", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_mutex_destroy", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_mutex_lock", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_mutex_unlock", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_key_create", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_key_delete", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_getspecific", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_setspecific", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_once", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_rwlockattr_destroy", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_rwlockattr_getkind_np", - 
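// pthread_create carries the richest signature in this header. A sketch with
// the type parameters spelled out (the two *Ptr helper names here are
// illustrative, not taken from this file):
//
//   FunctionSpec<
//       "pthread_create",
//       RetValSpec<IntType>,
//       [ArgSpec<PThreadTPtr>,          // pthread_t *restrict
//        ArgSpec<ConstPThreadAttrTPtr>, // const pthread_attr_t *restrict
//        ArgSpec<PThreadStartT>,        // void *(*)(void *)
//        ArgSpec<VoidPtr>]              // void *
//   >,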
RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_rwlockattr_getpshared", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_rwlockattr_init", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_rwlockattr_setkind_np", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_rwlockattr_setpshared", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_rwlock_init", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_rwlock_tryrdlock", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_rwlock_trywrlock", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_rwlock_timedrdlock", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_rwlock_timedwrlock", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_rwlock_clockrdlock", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_rwlock_clockwrlock", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_rwlock_rdlock", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_rwlock_wrlock", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_rwlock_unlock", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_rwlock_destroy", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_spin_init", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "pthread_spin_destroy", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_spin_lock", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_spin_trylock", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "pthread_spin_unlock", - RetValSpec, - [ArgSpec] - > - ] - >; - - HeaderSpec StdIO = HeaderSpec< - "stdio.h", - [], // Macros - [OffTType], // Types - [], // Enumerations - [ - FunctionSpec< - "flockfile", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "funlockfile", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "getc_unlocked", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "getchar_unlocked", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fileno", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fdopen", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec Dirent = HeaderSpec< - "dirent.h", - [], // Macros - [InoT, StructDirent, DIR], // Types - [], // Enumerations - [ - FunctionSpec< - "alphasort", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "closedir", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "dirfd", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fdopendir", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "opendir", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "readdir", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec Time = HeaderSpec< - "time.h", - [], // Macros - [ClockIdT, StructTimeSpec, StructTimevalType], // Types - [], // Enumerations - [ - FunctionSpec< - "clock_gettime", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "gettimeofday", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "nanosleep", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec SysWait = HeaderSpec< - "sys/wait.h", - [], // Macros - [PidT, StructRUsage, SigInfoType], - [], // Enumerations - [ - FunctionSpec< - "wait", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "waitpid", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - > - ] - >; - - HeaderSpec SysIOctl = HeaderSpec< - "sys/ioctl.h", - [ - Macro<"TIOCGETD">, - ], // Macros - [], // Types - [], // Enumerations - [] // Functions - >; - - HeaderSpec Spawn = HeaderSpec< - "spawn.h", - [], // Macros 
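// The Time header is a minimal example of how POSIX types defined in spec.td
// flow into a signature; clock_gettime above, fully parameterized:
//
//   FunctionSpec<
//       "clock_gettime",
//       RetValSpec<IntType>,
//       [ArgSpec<ClockIdT>, ArgSpec<StructTimeSpecPtr>]
//   >,
//
// i.e. int clock_gettime(clockid_t clockid, struct timespec *tp);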
- [ModeTType, PosixSpawnAttrT, PidT, PosixSpawnFileActionsT], - [], // Enumerations - [ - FunctionSpec< - "posix_spawn_file_actions_addclose", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "posix_spawn_file_actions_adddup2", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "posix_spawn_file_actions_addopen", - RetValSpec, - [ArgSpec, ArgSpec, - ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "posix_spawn_file_actions_destroy", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "posix_spawn_file_actions_init", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "posix_spawn", - RetValSpec, - [ArgSpec, ArgSpec, - ArgSpec, ArgSpec, - ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec Search = HeaderSpec< - "search.h", - [], // Macros - [ - ActionType, - EntryType, - LSearchCompareT, - ], // Types - [], // Enumerations - [ - FunctionSpec< - "hcreate", - RetValSpec, - [ - ArgSpec - ] - >, - FunctionSpec< - "hdestroy", - RetValSpec, - [] // Args - >, - FunctionSpec< - "hsearch", - RetValSpec, - [ - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "insque", - RetValSpec, - [ - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "remque", - RetValSpec, - [ - ArgSpec - ] - >, - FunctionSpec< - "lfind", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec - ] - > - ] - >; - - HeaderSpec Termios = HeaderSpec< - "termios.h", - [ - Macro<"NCCS">, - ], - [CcT, PidT, SpeedT, StructTermios, TcFlagT], // Types - [], // Enumerations - [ - FunctionSpec< - "cfgetispeed", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "cfgetospeed", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "cfsetispeed", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "cfsetospeed", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "tcdrain", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "tcflow", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "tcflush", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "tcgetattr", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "tcgetsid", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "tcsendbreak", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "tcsetattr", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec SysSelect = HeaderSpec< - "sys/select.h", - [], // Macros - [FdSet, SigSetType, StructTimevalType, StructTimeSpec, SuSecondsT, TimeTType], - [], // Enumerations - [ - FunctionSpec< - "select", - RetValSpec, - [ - ArgSpec, ArgSpec, ArgSpec, - ArgSpec, ArgSpec - ] - > - ] - >; - - HeaderSpec SysSocket = HeaderSpec< - "sys/socket.h", - [ - Macro<"AF_UNSPEC">, - Macro<"AF_UNIX">, - Macro<"AF_LOCAL">, - Macro<"AF_INET">, - Macro<"AF_INET6">, - Macro<"SOCK_STREAM">, - Macro<"SOCK_DGRAM">, - Macro<"SOCK_RAW">, - Macro<"SOCK_RDM">, - Macro<"SOCK_SEQPACKET">, - Macro<"SOCK_PACKET">, - ], // Macros - [ - SizeTType, - SSizeTType, - SAFamilyType, - StructSockAddr, - StructSockAddrUn, - SocklenType, - StructIovec, - StructMsghdr, - ], // Types - [], // Enumerations - [ - FunctionSpec< - "socket", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "socketpair", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "bind", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "send", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "sendto", - RetValSpec, - [ - ArgSpec, ArgSpec, ArgSpec, - ArgSpec, ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "sendmsg", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - 
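// The simplest of the socket entries is fully determined by its C prototype,
// int socket(int domain, int type, int protocol), so its spelled-out spec is
// just:
//
//   FunctionSpec<
//       "socket",
//       RetValSpec<IntType>,
//       [ArgSpec<IntType>, ArgSpec<IntType>, ArgSpec<IntType>]
//   >,
//
// send/sendto/recvfrom additionally thread SSizeTType returns and the
// sockaddr/socklen types declared in this header's Types list.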
FunctionSpec< - "recv", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "recvfrom", - RetValSpec, - [ - ArgSpec, ArgSpec, ArgSpec, - ArgSpec, ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "recvmsg", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - ] // Functions - >; - - HeaderSpec SysTypes = HeaderSpec< - "sys/types.h", - [], // Macros - [ - BlkCntT, - BlkSizeT, - ClockIdT, - DevT, - GidT, - InoT, - ModeTType, - NLinkT, - OffTType, - PThreadAttrTType, - PThreadCondAttrTType, - PThreadKeyT, - PThreadMutexAttrTType, - PThreadMutexTType, - PThreadOnceT, - PThreadRWLockAttrTType, - PThreadRWLockTType, - PThreadTType, - PidT, - SSizeTType, - SizeTType, - SuSecondsT, - TimeTType, - UidT - ], // Types - [], // Enumerations - [] // Functions - >; - - let Headers = [ - ArpaInet, - CType, - Dirent, - DlFcn, - Errno, - FCntl, - PThread, - Sched, - Signal, - Spawn, - StdIO, - StdLib, - SysIOctl, - SysMMan, - SysResource, - SysSelect, - SysSocket, - SysStat, - SysStatvfs, - SysTypes, - SysUtsName, - SysWait, - Time, - Termios, - UniStd, - String, - Search, - ]; -} diff --git a/libc/spec/spec.td b/libc/spec/spec.td deleted file mode 100644 index ad9ca76846c28..0000000000000 --- a/libc/spec/spec.td +++ /dev/null @@ -1,253 +0,0 @@ -class Type {} - -class NamedType : Type { - string Name = name; -} - -class Field { - string Name = name; - Type FieldType = type; -} - -// Class to describe concrete structs specified by a standard. -class Struct : NamedType { - list Fields; -} - -class EnumNameValue { - string Name = name; - string Value = value; -} - -class Enum enumerations> : NamedType { - list Enumerations = enumerations; -} - -class PtrType : Type { - Type PointeeType = type; -} - -class ConstType : Type { - Type UnqualifiedType = type; -} - -class RestrictedPtrType : Type { - Type PointeeType = type; -} - -// Builtin types. -def VarArgType : NamedType<"...">; -def VaListType : NamedType<"va_list">; -def VoidType : NamedType<"void">; -def IntType : NamedType<"int">; -def UnsignedIntType : NamedType<"unsigned int">; -def LongType : NamedType<"long">; -def UnsignedLongType : NamedType<"unsigned long">; -def LongLongType : NamedType<"long long">; -def UnsignedLongLongType : NamedType<"unsigned long long">; -def FloatType : NamedType<"float">; -def DoubleType : NamedType<"double">; -def LongDoubleType : NamedType<"long double">; -def CharType : NamedType<"char">; -def UnsignedCharType : NamedType<"unsigned char">; -def UnsignedShortType : NamedType<"unsigned short">; -def BoolType : NamedType<"bool">; - -def Float16Type : NamedType<"_Float16">; -def Float128Type : NamedType<"float128">; - -// Common types -def VoidPtr : PtrType; -def VoidPtrPtr : PtrType; -def RestrictedVoidPtrPtr : RestrictedPtrType; -def ConstVoidPtr : ConstType; - -def SizeTType : NamedType<"size_t">; -def SizeTPtr : PtrType; -def RestrictedSizeTPtr : RestrictedPtrType; - -def Char8TType : NamedType<"char8_t">; -def Char16TType : NamedType<"char16_t">; -def Char32TType : NamedType<"char32_t">; -def WCharType : NamedType<"wchar_t">; -def WIntType : NamedType<"wint_t">; - -def LongDoublePtr : PtrType; - -def IntMaxTType : NamedType<"intmax_t">; -def UIntMaxTType : NamedType<"uintmax_t">; - -def UInt16Type : NamedType<"uint16_t">; -def UInt32Type : NamedType<"uint32_t">; - -def OffTType : NamedType<"off_t">; -def OffTPtr : PtrType; -def SSizeTType : NamedType<"ssize_t">; - -// _Noreturn is really not a type, but it is convenient to treat it as a type. 
-def NoReturn : NamedType<"_Noreturn void">;
-
-// Types moved from stdc.td.
-def VoidRestrictedPtr : RestrictedPtrType<VoidType>;
-def ConstVoidRestrictedPtr : ConstType<VoidRestrictedPtr>;
-
-def CharPtr : PtrType<CharType>;
-def ConstCharPtr : ConstType<CharPtr>;
-def CharRestrictedPtr : RestrictedPtrType<CharType>;
-def CharRestrictedPtrPtr : RestrictedPtrType<CharPtr>;
-def ConstCharRestrictedPtr : ConstType<CharRestrictedPtr>;
-def ConstCharRestrictedPtrPtr : PtrType<ConstCharRestrictedPtr>;
-
-def OnceFlagType : NamedType<"once_flag">;
-def OnceFlagTypePtr : PtrType<OnceFlagType>;
-// TODO(sivachandra): Remove this non-standard type when a formal
-// way to describe callable types is available.
-def CallOnceFuncType : NamedType<"__call_once_func_t">;
-def MtxTType : NamedType<"mtx_t">;
-def MtxTTypePtr : PtrType<MtxTType>;
-def CndTType : NamedType<"cnd_t">;
-def CndTTypePtr : PtrType<CndTType>;
-def ThrdStartTType : NamedType<"thrd_start_t">;
-def ThrdTType : NamedType<"thrd_t">;
-def ThrdTTypePtr : PtrType<ThrdTType>;
-
-def IntPtr : PtrType<IntType>;
-def RestrictedIntPtr : RestrictedPtrType<IntType>;
-def FloatPtr : PtrType<FloatType>;
-def DoublePtr : PtrType<DoubleType>;
-def Float16Ptr : PtrType<Float16Type>;
-def Float128Ptr : PtrType<Float128Type>;
-def UnsignedCharPtr : PtrType<UnsignedCharType>;
-
-def ConstDoublePtr : ConstType<DoublePtr>;
-def ConstFloatPtr : ConstType<FloatPtr>;
-def ConstLongDoublePtr : ConstType<LongDoublePtr>;
-def ConstFloat16Ptr : ConstType<Float16Ptr>;
-def ConstFloat128Ptr : ConstType<Float128Ptr>;
-
-def SigHandlerT : NamedType<"__sighandler_t">;
-
-def TimeTType : NamedType<"time_t">;
-
-def StructTimeSpec : NamedType<"struct timespec">;
-def StructTimeSpecPtr : PtrType<StructTimeSpec>;
-def ConstStructTimeSpecPtr : ConstType<StructTimeSpecPtr>;
-def RestrictStructTimeSpecPtr : RestrictedPtrType<StructTimeSpec>;
-def ConstRestrictStructTimeSpecPtr : ConstType<RestrictStructTimeSpecPtr>;
-
-def BSearchCompareT : NamedType<"__bsearchcompare_t">;
-def QSortCompareT : NamedType<"__qsortcompare_t">;
-
-def AtexitHandlerT : NamedType<"__atexithandler_t">;
-
-def FILE : NamedType<"FILE">;
-def FILEPtr : PtrType<FILE>;
-def FILERestrictedPtr : RestrictedPtrType<FILE>;
-
-def PThreadTType : NamedType<"pthread_t">;
-
-def PidT : NamedType<"pid_t">;
-def RestrictedPidTPtr : RestrictedPtrType<PidT>;
-
-def StructRUsage : NamedType<"struct rusage">;
-def StructRUsagePtr : PtrType<StructRUsage>;
-
-def StructTimevalType : NamedType<"struct timeval">;
-def StructTimevalPtr : PtrType<StructTimevalType>;
-def RestrictedStructTimevalPtr : RestrictedPtrType<StructTimevalType>;
-
-def SuSecondsT : NamedType<"suseconds_t">;
-
-// Added because __assert_fail needs it.
-def UnsignedType : NamedType<"unsigned">;
-
-def ActionType : NamedType<"ACTION">;
-def EntryType : NamedType<"ENTRY">;
-def EntryTypePtr : PtrType<EntryType>;
-def EntryTypePtrPtr : PtrType<EntryTypePtr>;
-
-def MBStateTType : NamedType<"mbstate_t">;
-
-class Macro<string name> {
-  string Name = name;
-}
-
-class EnumeratedNameValue<string name, string value = ""> {
-  string Name = name;
-  string Value = value;
-}
-
-class Annotation {}
-
-class RetValSpec<Type type, list<Annotation> annotations = []> {
-  Type ReturnType = type;
-  list<Annotation> Annotations = annotations;
-}
-
-class ArgSpec<Type type, list<Annotation> annotations = [], string name = ""> {
-  Type ArgType = type;
-  list<Annotation> Annotations = annotations;
-  string Name = name;
-}
-
-// The following classes are used to describe function attributes.
-// In the future, we may consider supporting parameter attributes as well.
-// https://clang.llvm.org/docs/AttributeReference.html
-class FunctionAttr<string style, string attr> {
-  string Attr = attr;
-  // The style of the attribute, e.g. "gnu", "cxx11", "declspec".
-  //  - "gnu" is for GNU-style attributes: __attribute__((...))
-  //  - "cxx11" is for C++11-style attributes: [[...]]
-  //  - "declspec" is for Microsoft-style attributes: __declspec(...)
-  string Style = style;
-
-  // For the time being, we are only interested in identifier-like attributes.
- // We can extend this to support function-like attributes if needed. - // For example, in the future, we can #define __LIBC_ATTRIBUTE_NODISCARD(...) [[nodiscard(__VA_ARGS__)]] - // int FunctionLike = 0; -} -class GnuFunctionAttr : FunctionAttr<"gnu", attr> {} -class Cxx11FunctionAttr : FunctionAttr<"cxx11", attr> { - // The namespace of the attribute, e.g. "gnu" or "clang". Empty string means there is no namespace. - string Namespace = namespace; -} -class DeclspecFunctionAttr : FunctionAttr<"declspec", attr> {} -class FunctionAttrSpec instances> { - list Instances = instances; - string Macro = macro; -} - -class FunctionSpec args, list attrs = []> { - string Name = name; - RetValSpec Return = return; - list Args = args; - list Attributes = attrs; -} - -class GuardedFunctionSpec args, string guard_macro> : FunctionSpec { - string Guard = guard_macro; -} - -class ObjectSpec { - string Name = name; - string Type = type; -} - -class HeaderSpec macros = [], - list types = [], - list enumerations = [], - list functions = [], - list objects = []> { - string Name = name; - list Functions = functions; - list Types = types; - list Macros = macros; - list Enumerations = enumerations; - list Objects = objects; -} - -class StandardSpec { - string Name = name; - list Headers; -} diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td deleted file mode 100644 index 493ca1a6440df..0000000000000 --- a/libc/spec/stdc.td +++ /dev/null @@ -1,1827 +0,0 @@ -def StdC : StandardSpec<"stdc"> { - - NamedType StructTmType = NamedType<"struct tm">; - PtrType StructTmPtr = PtrType; - PtrType TimeTTypePtr = PtrType; - NamedType ClockT = NamedType<"clock_t">; - NamedType LocaleT = NamedType<"locale_t">; - - NamedType DivTType = NamedType<"div_t">; - NamedType LDivTType = NamedType<"ldiv_t">; - NamedType LLDivTType = NamedType<"lldiv_t">; - - NamedType JmpBuf = NamedType<"jmp_buf">; - - NamedType TssTType = NamedType<"tss_t">; - PtrType TssTPtr = PtrType; - NamedType TssDtorTType = NamedType<"tss_dtor_t">; - - HeaderSpec Assert = HeaderSpec< - "assert.h", - [ - Macro<"static_assert">, - Macro<"assert">, - ], - [], // Types - [], // Enumerations - [] - >; - - FunctionAttrSpec ConstAttr = FunctionAttrSpec<"__LIBC_CONST_ATTR", [ - Cxx11FunctionAttr<"const", "gnu">, - GnuFunctionAttr<"const">, - ]>; - - HeaderSpec CType = HeaderSpec< - "ctype.h", - [], // Macros - [ - LocaleT - ], // Types - [], // Enumerations - [ - FunctionSpec< - "isalnum", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "isalpha", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "isblank", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "iscntrl", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "isdigit", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "isgraph", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "islower", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "isprint", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "ispunct", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "isspace", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "isupper", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "isxdigit", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "tolower", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "toupper", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "isalnum_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isalpha_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isblank_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "iscntrl_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - 
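// ConstAttr bundles the C++11 and GNU spellings of the same attribute under
// one macro name; a FunctionSpec opts in via a trailing attribute list, as
// fabs does in the math.h section below:
//
//   FunctionSpec<"fabs", RetValSpec<DoubleType>, [ArgSpec<DoubleType>], [ConstAttr]>,
//
// letting the emitted header expand __LIBC_CONST_ATTR to [[gnu::const]] or
// __attribute__((const)) depending on what the consumer supports.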
FunctionSpec< - "isdigit_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isgraph_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "islower_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isprint_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "ispunct_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isspace_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isupper_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "isxdigit_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "tolower_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "toupper_l", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - NamedType FEnvT = NamedType<"fenv_t">; - PtrType FEnvTPtr = PtrType; - ConstType ConstFEnvTPtr = ConstType; - NamedType FExceptT = NamedType<"fexcept_t">; - PtrType FExceptTPtr = PtrType; - ConstType ConstFExceptTPtr = ConstType; - HeaderSpec Fenv = HeaderSpec< - "fenv.h", - [ - Macro<"FE_DIVBYZERO">, - Macro<"FE_INEXACT">, - Macro<"FE_INVALID">, - Macro<"FE_OVERFLOW">, - Macro<"FE_UNDERFLOW">, - Macro<"FE_ALL_EXCEPT">, - - Macro<"FE_DOWNWARD">, - Macro<"FE_TONEAREST">, - Macro<"FE_TOWARDZERO">, - Macro<"FE_UPWARD">, - - Macro<"FE_DFL_ENV"> - ], - [ - FEnvT, - FExceptT, - ], // Types - [], // Enumerations - [ - FunctionSpec< - "feclearexcept", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fetestexcept", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fetestexceptflag", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "feraiseexcept", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fesetround", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fegetround", - RetValSpec, - [] - >, - FunctionSpec< - "fegetenv", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fesetenv", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fegetexceptflag", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "fesetexcept", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fesetexceptflag", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "feholdexcept", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "feupdateenv", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec String = HeaderSpec< - "string.h", - [ - Macro<"NULL">, - ], - [ - SizeTType, - ], - [], // Enumerations - [ - FunctionSpec< - "memcpy", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "memmove", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "memcmp", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "memchr", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "memset", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "memset_explicit", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "strcpy", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strncpy", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "strcat", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strncat", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "strcmp", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strcoll", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strcoll_l", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "strncmp", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "strxfrm", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "strxfrm_l", - RetValSpec, - [ArgSpec, - ArgSpec, - 
ArgSpec, - ArgSpec] - >, - FunctionSpec< - "strchr", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strcspn", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strdup", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "strndup", - RetValSpec, - [ArgSpec,ArgSpec] - >, - FunctionSpec< - "strpbrk", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strrchr", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strspn", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strstr", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strtok", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "strerror", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "strlen", - RetValSpec, - [ArgSpec] - >, - ] - >; - - HeaderSpec Math = HeaderSpec< - "math.h", - [ - Macro<"MATH_ERRNO">, - Macro<"MATH_ERREXCEPT">, - Macro<"math_errhandling">, - - Macro<"HUGE_VAL">, - Macro<"INFINITY">, - Macro<"NAN">, - - Macro<"FP_INT_UPWARD">, - Macro<"FP_INT_DOWNWARD">, - Macro<"FP_INT_TOWARDZERO">, - Macro<"FP_INT_TONEARESTFROMZERO">, - Macro<"FP_INT_TONEAREST">, - - Macro<"FP_ILOGB0">, - Macro<"FP_ILOGBNAN">, - - Macro<"isfinite">, - Macro<"isinf">, - Macro<"isnan">, - ], - [ - NamedType<"float_t">, - NamedType<"double_t">, - NamedType<"float128">, - ], - [], // Enumerations - [ - FunctionSpec<"cbrt", RetValSpec, [ArgSpec]>, - FunctionSpec<"cbrtf", RetValSpec, [ArgSpec]>, - - FunctionSpec<"copysign", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"copysignf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"copysignl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"copysignf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"copysignf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"ceil", RetValSpec, [ArgSpec]>, - FunctionSpec<"ceilf", RetValSpec, [ArgSpec]>, - FunctionSpec<"ceill", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"ceilf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"ceilf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"daddl", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"ddivl", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"dfmal", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"dsubl", RetValSpec, [ArgSpec, ArgSpec]>, - - FunctionSpec<"fabs", RetValSpec, [ArgSpec], [ConstAttr]>, - FunctionSpec<"fabsf", RetValSpec, [ArgSpec]>, - FunctionSpec<"fabsl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"fabsf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fabsf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fadd", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"faddl", RetValSpec, [ArgSpec, ArgSpec]>, - - FunctionSpec<"fdim", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fdimf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fdiml", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fdimf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fdimf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fdiv", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fdivl", RetValSpec, [ArgSpec, ArgSpec]>, - - FunctionSpec<"ffma", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"ffmal", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - - FunctionSpec<"floor", RetValSpec, [ArgSpec]>, - FunctionSpec<"floorf", RetValSpec, [ArgSpec]>, - FunctionSpec<"floorl", RetValSpec, [ArgSpec]>, - 
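// GuardedFunctionSpec is the conditional variant used for the f16/f128 entry
// points: the trailing string names a feature-test macro, and the declaration
// is emitted only when that macro is defined. Fully parameterized, the
// ceilf16 entry above reads:
//
//   GuardedFunctionSpec<"ceilf16", RetValSpec<Float16Type>,
//                       [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,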
GuardedFunctionSpec<"floorf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"floorf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fmin", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fminf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fminl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fminf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - GuardedFunctionSpec<"fminf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"fmax", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaxf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaxl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fmaxf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - GuardedFunctionSpec<"fmaxf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"fmaximum", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaximumf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaximuml", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fmaximumf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fmaximumf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fmaximum_num", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaximum_numf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaximum_numl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fmaximum_numf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fmaximum_numf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fmaximum_mag", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaximum_magf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaximum_magl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fmaximum_magf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fmaximum_magf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fmaximum_mag_num", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaximum_mag_numf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaximum_mag_numl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fmaximum_mag_numf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fmaximum_mag_numf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fminimum", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fminimumf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fminimuml", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fminimumf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fminimumf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fminimum_num", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fminimum_numf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmaximum_numl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fminimum_numf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fminimum_numf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fminimum_mag", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fminimum_magf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fminimum_magl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fminimum_magf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - 
GuardedFunctionSpec<"fminimum_magf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fminimum_mag_num", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fminimum_mag_numf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fminimum_mag_numl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fminimum_mag_numf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fminimum_mag_numf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fma", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"fmaf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - - GuardedFunctionSpec<"f16fmaf128", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, - - FunctionSpec<"fmod", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmodf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmodl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fmodf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fmodf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"frexp", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"frexpf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"frexpl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"frexpf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"frexpf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fromfp", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"fromfpf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"fromfpl", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fromfpf16", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fromfpf128", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fromfpx", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"fromfpxf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"fromfpxl", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"fromfpxf16", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"fromfpxf128", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"fsub", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fsubl", RetValSpec, [ArgSpec, ArgSpec]>, - - FunctionSpec<"ufromfp", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"ufromfpf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"ufromfpl", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"ufromfpf16", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"ufromfpf128", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"ufromfpx", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"ufromfpxf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"ufromfpxl", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"ufromfpxf16", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"ufromfpxf128", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"hypot", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"hypotf", RetValSpec, [ArgSpec, ArgSpec]>, - - FunctionSpec<"ilogb", RetValSpec, [ArgSpec]>, - FunctionSpec<"ilogbf", RetValSpec, [ArgSpec]>, - FunctionSpec<"ilogbl", RetValSpec, [ArgSpec]>, - 
GuardedFunctionSpec<"ilogbf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"ilogbf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"llogb", RetValSpec, [ArgSpec]>, - FunctionSpec<"llogbf", RetValSpec, [ArgSpec]>, - FunctionSpec<"llogbl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"llogbf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"llogbf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"ldexp", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"ldexpf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"ldexpl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"ldexpf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"ldexpf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"log10", RetValSpec, [ArgSpec]>, - FunctionSpec<"log10f", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"log10f16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"log1p", RetValSpec, [ArgSpec]>, - FunctionSpec<"log1pf", RetValSpec, [ArgSpec]>, - - FunctionSpec<"log2", RetValSpec, [ArgSpec]>, - FunctionSpec<"log2f", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"log2f16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"log", RetValSpec, [ArgSpec]>, - FunctionSpec<"logf", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"logf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"logb", RetValSpec, [ArgSpec]>, - FunctionSpec<"logbf", RetValSpec, [ArgSpec]>, - FunctionSpec<"logbl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"logbf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"logbf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"modf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"modff", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"modfl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"modff16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"modff128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"cos", RetValSpec, [ArgSpec]>, - FunctionSpec<"cosf", RetValSpec, [ArgSpec]>, - FunctionSpec<"sin", RetValSpec, [ArgSpec]>, - FunctionSpec<"sinf", RetValSpec, [ArgSpec]>, - FunctionSpec<"tan", RetValSpec, [ArgSpec]>, - FunctionSpec<"tanf", RetValSpec, [ArgSpec]>, - - FunctionSpec<"erff", RetValSpec, [ArgSpec]>, - - FunctionSpec<"exp", RetValSpec, [ArgSpec]>, - FunctionSpec<"expf", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"expf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"exp2", RetValSpec, [ArgSpec]>, - FunctionSpec<"exp2f", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"exp2f16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"exp2m1f", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"exp2m1f16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"expm1", RetValSpec, [ArgSpec]>, - FunctionSpec<"expm1f", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"expm1f16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"exp10", RetValSpec, [ArgSpec]>, - FunctionSpec<"exp10f", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"exp10f16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"exp10m1f", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"exp10m1f16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"remainder", RetValSpec, [ArgSpec, ArgSpec]>, - 
FunctionSpec<"remainderf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"remainderl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"remainderf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"remainderf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"remquo", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"remquof", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"remquol", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"remquof16", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"remquof128", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"round", RetValSpec, [ArgSpec]>, - FunctionSpec<"roundf", RetValSpec, [ArgSpec]>, - FunctionSpec<"roundl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"roundf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"roundf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"roundeven", RetValSpec, [ArgSpec]>, - FunctionSpec<"roundevenf", RetValSpec, [ArgSpec]>, - FunctionSpec<"roundevenl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"roundevenf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"roundevenf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"lround", RetValSpec, [ArgSpec]>, - FunctionSpec<"lroundf", RetValSpec, [ArgSpec]>, - FunctionSpec<"lroundl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"lroundf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"lroundf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"llround", RetValSpec, [ArgSpec]>, - FunctionSpec<"llroundf", RetValSpec, [ArgSpec]>, - FunctionSpec<"llroundl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"llroundf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"llroundf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"rint", RetValSpec, [ArgSpec]>, - FunctionSpec<"rintf", RetValSpec, [ArgSpec]>, - FunctionSpec<"rintl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"rintf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"rintf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"lrint", RetValSpec, [ArgSpec]>, - FunctionSpec<"lrintf", RetValSpec, [ArgSpec]>, - FunctionSpec<"lrintl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"lrintf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"lrintf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"llrint", RetValSpec, [ArgSpec]>, - FunctionSpec<"llrintf", RetValSpec, [ArgSpec]>, - FunctionSpec<"llrintl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"llrintf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"llrintf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"sqrt", RetValSpec, [ArgSpec]>, - FunctionSpec<"sqrtf", RetValSpec, [ArgSpec]>, - FunctionSpec<"sqrtl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"sqrtf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"sqrtf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"trunc", RetValSpec, [ArgSpec]>, - FunctionSpec<"truncf", RetValSpec, [ArgSpec]>, - FunctionSpec<"truncl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"truncf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - 
GuardedFunctionSpec<"truncf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"nearbyint", RetValSpec, [ArgSpec]>, - FunctionSpec<"nearbyintf", RetValSpec, [ArgSpec]>, - FunctionSpec<"nearbyintl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"nearbyintf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"nearbyintf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"nextafterf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"nextafter", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"nextafterl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"nextafterf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"nextafterf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"nexttowardf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"nexttoward", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"nexttowardl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"nexttowardf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"nextdown", RetValSpec, [ArgSpec]>, - FunctionSpec<"nextdownf", RetValSpec, [ArgSpec]>, - FunctionSpec<"nextdownl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"nextdownf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"nextdownf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"nextup", RetValSpec, [ArgSpec]>, - FunctionSpec<"nextupf", RetValSpec, [ArgSpec]>, - FunctionSpec<"nextupl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"nextupf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"nextupf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"powf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"pow", RetValSpec, [ArgSpec, ArgSpec]>, - - FunctionSpec<"coshf", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"coshf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"sinhf", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"sinhf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"tanhf", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"tanhf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - - FunctionSpec<"acosf", RetValSpec, [ArgSpec]>, - - FunctionSpec<"asinf", RetValSpec, [ArgSpec]>, - FunctionSpec<"asin", RetValSpec, [ArgSpec]>, - - FunctionSpec<"atanf", RetValSpec, [ArgSpec]>, - - FunctionSpec<"atan2", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"atan2f", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"atan2l", RetValSpec, [ArgSpec, ArgSpec]>, - - FunctionSpec<"acoshf", RetValSpec, [ArgSpec]>, - FunctionSpec<"asinhf", RetValSpec, [ArgSpec]>, - FunctionSpec<"atanhf", RetValSpec, [ArgSpec]>, - - FunctionSpec<"scalbln", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"scalblnf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"scalblnl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"scalblnf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"scalblnf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"scalbn", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"scalbnf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"scalbnl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"scalbnf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"scalbnf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"nanf", 
RetValSpec, [ArgSpec]>, - FunctionSpec<"nan", RetValSpec, [ArgSpec]>, - FunctionSpec<"nanl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"nanf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"nanf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"issignaling", RetValSpec, [ArgSpec]>, - FunctionSpec<"issignalingf", RetValSpec, [ArgSpec]>, - FunctionSpec<"issignalingl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"issignalingf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"issignalingf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"canonicalize", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"canonicalizef", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"canonicalizel", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"canonicalizef16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"canonicalizef128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"iscanonical", RetValSpec, [ArgSpec]>, - FunctionSpec<"iscanonicalf", RetValSpec, [ArgSpec]>, - FunctionSpec<"iscanonicall", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"iscanonicalf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"iscanonicalf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"dsqrtl", RetValSpec, [ArgSpec]>, - - FunctionSpec<"totalorder", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"totalorderf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"totalorderl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"totalorderf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"totalorderf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"totalordermag", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"totalordermagf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"totalordermagl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"totalordermagf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"totalordermagf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"getpayload", RetValSpec, [ArgSpec]>, - FunctionSpec<"getpayloadf", RetValSpec, [ArgSpec]>, - FunctionSpec<"getpayloadl", RetValSpec, [ArgSpec]>, - GuardedFunctionSpec<"getpayloadf16", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"getpayloadf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"setpayload", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"setpayloadf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"setpayloadl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"setpayloadf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"setpayloadf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - FunctionSpec<"setpayloadsig", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"setpayloadsigf", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"setpayloadsigl", RetValSpec, [ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"setpayloadsigf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"setpayloadsigf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - - GuardedFunctionSpec<"f16addf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, - - GuardedFunctionSpec<"f16subf128", RetValSpec, [ArgSpec, ArgSpec], 
"LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, - - FunctionSpec<"fmul", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fmull", RetValSpec, [ArgSpec, ArgSpec]>, - - FunctionSpec<"dmull", RetValSpec, [ArgSpec, ArgSpec]>, - - GuardedFunctionSpec<"f16mulf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, - - FunctionSpec<"fsqrt", RetValSpec, [ArgSpec]>, - FunctionSpec<"fsqrtl", RetValSpec, [ArgSpec]>, - - GuardedFunctionSpec<"f16divf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, - - GuardedFunctionSpec<"f16sqrtf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, - - FunctionSpec<"lgamma", RetValSpec, [ArgSpec]>, - FunctionSpec<"lgammaf", RetValSpec, [ArgSpec]>, - FunctionSpec<"lgammal", RetValSpec, [ArgSpec]>, - ] - >; - - HeaderSpec StdIO = HeaderSpec< - "stdio.h", - [ - Macro<"stdin">, - Macro<"stderr">, - Macro<"stdout">, - Macro<"_IOFBF">, - Macro<"_IOLBF">, - Macro<"_IONBF">, - Macro<"EOF">, - ], // Macros - [ // Types - SizeTType, - FILE, - ], - [], // Enumerations - [ - FunctionSpec< - "clearerr", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fclose", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "feof", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "ferror", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fgetc", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fgets", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "fflush", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fopen", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "fputc", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "ftell", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "getc", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "getchar", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "putc", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "putchar", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fputs", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "puts", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "fread", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "fseek", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "fwrite", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "remove", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "rename", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "setbuf", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "setvbuf", - RetValSpec, - [ArgSpec, ArgSpec, ArgSpec, ArgSpec] - >, - FunctionSpec< - "sscanf", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "vsscanf", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "scanf", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "vscanf", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "fscanf", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "vfscanf", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "sprintf", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "snprintf", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "printf", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "fprintf", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "asprintf", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "vsprintf", - RetValSpec, - [ArgSpec, - ArgSpec, - 
ArgSpec] - >, - FunctionSpec< - "vsnprintf", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "vprintf", - RetValSpec, - [ArgSpec, - ArgSpec] - >, - FunctionSpec< - "vfprintf", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - FunctionSpec< - "ungetc", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "vasprintf", - RetValSpec, - [ArgSpec, - ArgSpec, - ArgSpec] - >, - ], - [ - ObjectSpec< - "stdin", - "FILE *" - >, - ObjectSpec< - "stdout", - "FILE *" - >, - ObjectSpec< - "stderr", - "FILE *" - >, - ] - >; - - HeaderSpec StdBit = HeaderSpec< - "stdbit.h", - [ - Macro<"__STDC_VERSION_STDBIT_H__">, - Macro<"__STDC_ENDIAN_LITTLE__">, - Macro<"__STDC_ENDIAN_BIG__">, - Macro<"__STDC_ENDIAN_NATIVE__">, - Macro<"stdc_leading_zeros">, - Macro<"stdc_leading_ones">, - Macro<"stdc_trailing_zeros">, - Macro<"stdc_trailing_ones">, - Macro<"stdc_first_leading_zero">, - Macro<"stdc_first_leading_one">, - Macro<"stdc_first_trailing_zero">, - Macro<"stdc_first_trailing_one">, - Macro<"stdc_count_zeros">, - Macro<"stdc_count_ones">, - Macro<"stdc_has_single_bit">, - Macro<"stdc_bit_width">, - Macro<"stdc_bit_floor">, - Macro<"stdc_bit_ceil"> - ], // Macros - [], // Types - [], // Enumerations - [ - FunctionSpec<"stdc_leading_zeros_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_leading_zeros_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_leading_zeros_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_leading_zeros_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_leading_zeros_ull", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_leading_ones_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_leading_ones_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_leading_ones_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_leading_ones_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_leading_ones_ull", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_trailing_zeros_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_trailing_zeros_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_trailing_zeros_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_trailing_zeros_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_trailing_zeros_ull", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_trailing_ones_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_trailing_ones_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_trailing_ones_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_trailing_ones_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_trailing_ones_ull", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_leading_zero_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_leading_zero_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_leading_zero_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_leading_zero_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_leading_zero_ull", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_leading_one_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_leading_one_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_leading_one_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_leading_one_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_leading_one_ull", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_trailing_one_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_trailing_one_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_trailing_one_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_trailing_one_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_trailing_one_ull", RetValSpec, 
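// The stdbit.h entries follow a strict suffix scheme: _uc/_us/_ui/_ul/_ull
// select the unsigned char/short/int/long/long long variant, and the
// unsuffixed names in the Macros list are the type-generic dispatchers.
// Spelled out, the unsigned int count looks like:
//
//   FunctionSpec<"stdc_leading_zeros_ui", RetValSpec<UnsignedIntType>,
//                [ArgSpec<UnsignedIntType>]>,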
[ArgSpec]>, - FunctionSpec<"stdc_count_zeros_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_count_zeros_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_count_zeros_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_count_zeros_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_count_zeros_ull", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_count_ones_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_count_ones_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_count_ones_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_count_ones_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_count_ones_ull", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_has_single_bit_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_has_single_bit_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_has_single_bit_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_has_single_bit_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_has_single_bit_ull", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_width_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_width_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_width_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_width_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_width_ull", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_floor_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_floor_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_floor_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_floor_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_floor_ull", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_ceil_uc", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_ceil_us", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_ceil_ui", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_ceil_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_bit_ceil_ull", RetValSpec, [ArgSpec]> - ] // Functions - >; - - HeaderSpec StdCkdInt = HeaderSpec< - "stdckdint.h", - [ - Macro<"__STDC_VERSION_STDCKDINT_H__">, - Macro<"ckd_add">, - Macro<"ckd_sub">, - Macro<"ckd_mul"> - ], // Macros - [], // Types - [], // Enumerations - [] // Functions - >; - - HeaderSpec StdLib = HeaderSpec< - "stdlib.h", - [], // Macros - [ - DivTType, - LDivTType, - LLDivTType, - SizeTType, - BSearchCompareT, - QSortCompareT, - AtexitHandlerT, - ], // Types - [], // Enumerations - [ - FunctionSpec<"abort", RetValSpec, [ArgSpec]>, - - FunctionSpec<"bsearch", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - - FunctionSpec<"abs", RetValSpec, [ArgSpec]>, - FunctionSpec<"labs", RetValSpec, [ArgSpec]>, - FunctionSpec<"llabs", RetValSpec, [ArgSpec]>, - - FunctionSpec<"atof", RetValSpec, [ArgSpec]>, - FunctionSpec<"atoi", RetValSpec, [ArgSpec]>, - FunctionSpec<"atol", RetValSpec, [ArgSpec]>, - FunctionSpec<"atoll", RetValSpec, [ArgSpec]>, - - FunctionSpec<"div", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"ldiv", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"lldiv", RetValSpec, [ArgSpec, ArgSpec]>, - - FunctionSpec<"qsort", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - - FunctionSpec<"rand", RetValSpec, [ArgSpec]>, - FunctionSpec<"srand", RetValSpec, [ArgSpec]>, - - FunctionSpec<"strfromf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strfromd", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strfroml", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - - FunctionSpec<"strtof", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"strtod", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"strtold", RetValSpec, [ArgSpec, ArgSpec]>, - 
FunctionSpec<"strtol", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoll", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoul", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoull", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - - FunctionSpec<"strtof_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtod_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtold_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtol_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoll_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoul_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoull_l", RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec]>, - - FunctionSpec<"malloc", RetValSpec, [ArgSpec]>, - FunctionSpec<"calloc", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"realloc", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"aligned_alloc", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"free", RetValSpec, [ArgSpec]>, - - FunctionSpec<"_Exit", RetValSpec, [ArgSpec]>, - FunctionSpec<"at_quick_exit", RetValSpec, [ArgSpec]>, - FunctionSpec<"atexit", RetValSpec, [ArgSpec]>, - FunctionSpec<"exit", RetValSpec, [ArgSpec]>, - FunctionSpec<"quick_exit", RetValSpec, [ArgSpec]>, - - FunctionSpec<"system", RetValSpec, [ArgSpec]>, - ] - >; - - NamedType IMaxDivTType = NamedType<"imaxdiv_t">; - - HeaderSpec IntTypes = HeaderSpec< - "inttypes.h", - [ - Macro<"__STDC_VERSION_INTTYPES_H__">, - ], // Macros - [ - IMaxDivTType, - ], // Types - [], // Enumerations - [ - FunctionSpec<"imaxabs", RetValSpec, [ArgSpec]>, - FunctionSpec<"imaxdiv", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"strtoimax", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - FunctionSpec<"strtoumax", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - ] - >; - - HeaderSpec Errno = HeaderSpec< - "errno.h", - [ - Macro<"errno">, - Macro<"EDOM">, - Macro<"EILSEQ">, - Macro<"ERANGE">, - ] - >; - - HeaderSpec Float = HeaderSpec< - "float.h", - [ - Macro<"FLT_MANT_DIG">, - Macro<"DBL_MANT_DIG">, - Macro<"LDBL_MANT_DIG">, - ] - >; - - HeaderSpec StdInt = HeaderSpec<"StdInt.h">; - - HeaderSpec Limits = HeaderSpec<"limits.h">; - - NamedType SigAtomicT = NamedType<"sig_atomic_t">; - HeaderSpec Signal = HeaderSpec< - "signal.h", - [ - Macro<"SIG_BLOCK">, - Macro<"SIG_UNBLOCK">, - Macro<"SIG_SETMASK">, - - Macro<"SIGABRT">, - Macro<"SIGFPE">, - Macro<"SIGILL">, - Macro<"SIGINT">, - Macro<"SIGSEGV">, - Macro<"SIGTERM"> - ], - [ - SizeTType, - SigAtomicT, - SigHandlerT, - ], - [], // Enumerations - [ - FunctionSpec<"raise", RetValSpec, [ArgSpec]>, - FunctionSpec< - "signal", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec Threads = HeaderSpec< - "threads.h", - [ - Macro<"ONCE_FLAG_INIT">, - ], - [ - OnceFlagType, - CallOnceFuncType, - CndTType, - MtxTType, - ThrdStartTType, - ThrdTType, - TssTType, - TssDtorTType, - ], - [ - EnumeratedNameValue<"mtx_plain">, - EnumeratedNameValue<"mtx_recursive">, - EnumeratedNameValue<"mtx_timed">, - EnumeratedNameValue<"thrd_timedout">, - EnumeratedNameValue<"thrd_success">, - EnumeratedNameValue<"thrd_busy">, - EnumeratedNameValue<"thrd_error">, - EnumeratedNameValue<"thrd_nomem">, - ], - [ - FunctionSpec< - "call_once", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "cnd_broadcast", - RetValSpec, - [ - ArgSpec, - ] - >, - FunctionSpec< - "cnd_destroy", - RetValSpec, - [ - ArgSpec, - ] - >, - FunctionSpec< - "cnd_init", - RetValSpec, - [ - 
ArgSpec, - ] - >, - FunctionSpec< - "cnd_signal", - RetValSpec, - [ - ArgSpec, - ] - >, - FunctionSpec< - "cnd_wait", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "mtx_init", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "mtx_destroy", - RetValSpec, - [ - ArgSpec, - ] - >, - FunctionSpec< - "mtx_lock", - RetValSpec, - [ - ArgSpec, - ] - >, - FunctionSpec< - "mtx_unlock", - RetValSpec, - [ - ArgSpec, - ] - >, - FunctionSpec< - "thrd_create", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "thrd_join", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "thrd_detach", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "thrd_current", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "thrd_equal", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "thrd_exit", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "tss_create", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "tss_delete", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "tss_get", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "tss_set", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec Time = HeaderSpec< - "time.h", - [], // Macros - [ // Types - ClockT, - StructTmType, - StructTimeSpec, - TimeTType, - SizeTType, - ], - [], // Enumerations - [ - FunctionSpec< - "asctime", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "asctime_r", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "ctime", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "ctime_r", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "clock", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "difftime", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "gmtime", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "gmtime_r", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ] - >, - FunctionSpec< - "mktime", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "time", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "timespec_get", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ] - >, - ] - >; - - HeaderSpec SetJmp = HeaderSpec< - "setjmp.h", - [], // Macros - [JmpBuf], - [], // Enumerations - [ - FunctionSpec< - "longjmp", - RetValSpec, - [ArgSpec, ArgSpec] - >, - FunctionSpec< - "setjmp", - RetValSpec, - [ArgSpec] - >, - FunctionSpec< - "longjmp", - RetValSpec, - [ArgSpec, ArgSpec] - >, - ] - >; - - HeaderSpec UChar = HeaderSpec< - "uchar.h", - [], // Macros - [ //Types - MBStateTType, - Char8TType, - Char16TType, - Char32TType, - SizeTType, - ], - [], // Enumerations - [] - >; - - HeaderSpec WChar = HeaderSpec< - "wchar.h", - [ // Macros - Macro<"WEOF">, - ], - [ //Types - MBStateTType, - SizeTType, - WIntType, - WCharType, - ], - [], // Enumerations - [ - FunctionSpec< - "wctob", - RetValSpec, - [ArgSpec] - >, - ] - >; - - NamedType StructLconv = NamedType<"struct lconv">; - PtrType StructLconvPtr = PtrType; - - HeaderSpec Locale = HeaderSpec< - "locale.h", - [], // Macros - [LocaleT, StructLconv], // Types - [], // Enumerations - [ - FunctionSpec< - "duplocale", - RetValSpec, - [ - ArgSpec - ] - >, - FunctionSpec< - "freelocale", - RetValSpec, - [ - ArgSpec - ] - >, - FunctionSpec< - "localeconv", - RetValSpec, - [] - >, - FunctionSpec< - "newlocale", - RetValSpec, - [ - ArgSpec, - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "setlocale", - RetValSpec, - [ - ArgSpec, - ArgSpec - ] - >, - FunctionSpec< - "uselocale", - RetValSpec, - [ - ArgSpec - ] - > - ] // Functions - >; - - let Headers = [ - Assert, - CType, - 
Errno, - Fenv, - Float, - StdInt, - Limits, - Math, - String, - StdBit, - StdCkdInt, - StdIO, - StdLib, - IntTypes, - SetJmp, - Signal, - Threads, - Time, - UChar, - WChar, - Locale, - ]; -} diff --git a/libc/spec/stdc_ext.td b/libc/spec/stdc_ext.td deleted file mode 100644 index dee3b8bdf6fee..0000000000000 --- a/libc/spec/stdc_ext.td +++ /dev/null @@ -1,82 +0,0 @@ -// Fixed point types. -// From ISO/IEC TR 18037:2008 standard: -// https://standards.iso.org/ittf/PubliclyAvailableStandards/c051126_ISO_IEC_TR_18037_2008.zip -def ShortFractType : NamedType<"short fract">; -def FractType : NamedType<"fract">; -def LongFractType : NamedType<"long fract">; -def UnsignedShortFractType : NamedType<"unsigned short fract">; -def UnsignedFractType : NamedType<"unsigned fract">; -def UnsignedLongFractType : NamedType<"unsigned long fract">; - -def ShortAccumType : NamedType<"short accum">; -def AccumType : NamedType<"accum">; -def LongAccumType : NamedType<"long accum">; -def UnsignedShortAccumType : NamedType<"unsigned short accum">; -def UnsignedAccumType : NamedType<"unsigned accum">; -def UnsignedLongAccumType : NamedType<"unsigned long accum">; - -def IntHrT : NamedType <"int_hr_t">; -def IntRT : NamedType<"int_r_t">; -def IntLrT : NamedType<"int_lr_t">; -def IntHkT : NamedType<"int_hk_t">; -def IntKT : NamedType<"int_k_t">; -def IntLkT : NamedType<"int_lk_t">; -def UIntUhrT : NamedType<"uint_uhr_t">; -def UIntUrT : NamedType<"uint_ur_t">; -def UIntUlrT : NamedType<"uint_ulr_t">; -def UIntUhkT : NamedType<"uint_uhk_t">; -def UIntUkT : NamedType<"uint_uk_t">; -def UIntUlkT : NamedType<"uint_ulk_t">; - -def StdcExt : StandardSpec<"stdc_ext"> { - // From ISO/IEC TR 18037:2008 standard: - // https://standards.iso.org/ittf/PubliclyAvailableStandards/c051126_ISO_IEC_TR_18037_2008.zip - HeaderSpec StdFix = HeaderSpec< - "stdfix.h", - [], // macros - [IntHrT,IntRT, IntLrT, IntHkT, IntKT, IntLkT, UIntUhrT, UIntUrT, UIntUlrT, UIntUhkT, UIntUkT, UIntUlkT], // types - [], // enums - [ // functions - GuardedFunctionSpec<"abshr", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"absr", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"abslr", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - - GuardedFunctionSpec<"abshk", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"absk", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"abslk", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - - GuardedFunctionSpec<"roundhr", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"roundr", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"roundlr", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - - GuardedFunctionSpec<"roundhk", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"roundk", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"roundlk", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - - GuardedFunctionSpec<"rounduhr", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"roundur", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"roundulr", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - - GuardedFunctionSpec<"rounduhk", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - 
GuardedFunctionSpec<"rounduk", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"roundulk", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - - GuardedFunctionSpec<"hrbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"rbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"lrbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"hkbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"kbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"lkbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"uhrbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"urbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"ukbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"ulrbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"uhkbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - GuardedFunctionSpec<"ulkbits", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, - ] - >; - - let Headers = [ - StdFix, - ]; -} diff --git a/libc/src/__support/OSUtil/linux/exit.cpp b/libc/src/__support/OSUtil/linux/exit.cpp index 9c64ce42be185..e26b90f6b18eb 100644 --- a/libc/src/__support/OSUtil/linux/exit.cpp +++ b/libc/src/__support/OSUtil/linux/exit.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl #include "src/__support/common.h" #include "src/__support/macros/config.h" -#include "syscall.h" // For internal syscall function. #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/__support/ctype_utils.h b/libc/src/__support/ctype_utils.h index 91f6ce8cabd8d..be0f25330af9e 100644 --- a/libc/src/__support/ctype_utils.h +++ b/libc/src/__support/ctype_utils.h @@ -15,44 +15,567 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -// ------------------------------------------------------ -// Rationale: Since these classification functions are -// called in other functions, we will avoid the overhead -// of a function call by inlining them. -// ------------------------------------------------------ +// ----------------------------------------------------------------------------- +// ****************** WARNING ****************** +// ****************** DO NOT TRY TO OPTIMIZE THESE FUNCTIONS! ****************** +// ----------------------------------------------------------------------------- +// This switch/case form is easier for the compiler to understand, and is +// optimized into a form that is almost always the same as or better than +// versions written by hand (see https://godbolt.org/z/qvrebqvvr). Also this +// form makes these functions encoding independent. If you want to rewrite these +// functions, make sure you have benchmarks to show your new solution is faster, +// as well as a way to support non-ASCII character encodings. -LIBC_INLINE static constexpr bool isalpha(unsigned ch) { - return (ch | 32) - 'a' < 26; +// Similarly, do not change these functions to use case ranges. e.g. +// bool islower(int ch) { +// switch(ch) { +// case 'a'...'z': +// return true; +// } +// } +// This assumes the character ranges are contiguous, which they aren't in +// EBCDIC. 
Technically we could use some smaller ranges, but that's even harder +// to read. + +LIBC_INLINE static constexpr bool islower(int ch) { + switch (ch) { + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + case 'g': + case 'h': + case 'i': + case 'j': + case 'k': + case 'l': + case 'm': + case 'n': + case 'o': + case 'p': + case 'q': + case 'r': + case 's': + case 't': + case 'u': + case 'v': + case 'w': + case 'x': + case 'y': + case 'z': + return true; + default: + return false; + } } -LIBC_INLINE static constexpr bool isdigit(unsigned ch) { - return (ch - '0') < 10; +LIBC_INLINE static constexpr bool isupper(int ch) { + switch (ch) { + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + case 'G': + case 'H': + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'O': + case 'P': + case 'Q': + case 'R': + case 'S': + case 'T': + case 'U': + case 'V': + case 'W': + case 'X': + case 'Y': + case 'Z': + return true; + default: + return false; + } } -LIBC_INLINE static constexpr bool isalnum(unsigned ch) { - return isalpha(ch) || isdigit(ch); +LIBC_INLINE static constexpr bool isdigit(int ch) { + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return true; + default: + return false; + } } -LIBC_INLINE static constexpr bool isgraph(unsigned ch) { - return 0x20 < ch && ch < 0x7f; +LIBC_INLINE static constexpr int tolower(int ch) { + switch (ch) { + case 'A': + return 'a'; + case 'B': + return 'b'; + case 'C': + return 'c'; + case 'D': + return 'd'; + case 'E': + return 'e'; + case 'F': + return 'f'; + case 'G': + return 'g'; + case 'H': + return 'h'; + case 'I': + return 'i'; + case 'J': + return 'j'; + case 'K': + return 'k'; + case 'L': + return 'l'; + case 'M': + return 'm'; + case 'N': + return 'n'; + case 'O': + return 'o'; + case 'P': + return 'p'; + case 'Q': + return 'q'; + case 'R': + return 'r'; + case 'S': + return 's'; + case 'T': + return 't'; + case 'U': + return 'u'; + case 'V': + return 'v'; + case 'W': + return 'w'; + case 'X': + return 'x'; + case 'Y': + return 'y'; + case 'Z': + return 'z'; + default: + return ch; + } } -LIBC_INLINE static constexpr bool islower(unsigned ch) { - return (ch - 'a') < 26; +LIBC_INLINE static constexpr int toupper(int ch) { + switch (ch) { + case 'a': + return 'A'; + case 'b': + return 'B'; + case 'c': + return 'C'; + case 'd': + return 'D'; + case 'e': + return 'E'; + case 'f': + return 'F'; + case 'g': + return 'G'; + case 'h': + return 'H'; + case 'i': + return 'I'; + case 'j': + return 'J'; + case 'k': + return 'K'; + case 'l': + return 'L'; + case 'm': + return 'M'; + case 'n': + return 'N'; + case 'o': + return 'O'; + case 'p': + return 'P'; + case 'q': + return 'Q'; + case 'r': + return 'R'; + case 's': + return 'S'; + case 't': + return 'T'; + case 'u': + return 'U'; + case 'v': + return 'V'; + case 'w': + return 'W'; + case 'x': + return 'X'; + case 'y': + return 'Y'; + case 'z': + return 'Z'; + default: + return ch; + } } -LIBC_INLINE static constexpr bool isupper(unsigned ch) { - return (ch - 'A') < 26; +LIBC_INLINE static constexpr bool isalpha(int ch) { + switch (ch) { + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + case 'g': + case 'h': + case 'i': + case 'j': + case 'k': + case 'l': + case 'm': + case 'n': + case 'o': + case 'p': + case 'q': + case 'r': + case 's': + case 't': + case 'u': + case 'v': + case 'w': + case 'x': + case 'y': + case 'z': + case 
'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + case 'G': + case 'H': + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'O': + case 'P': + case 'Q': + case 'R': + case 'S': + case 'T': + case 'U': + case 'V': + case 'W': + case 'X': + case 'Y': + case 'Z': + return true; + default: + return false; + } } -LIBC_INLINE static constexpr bool isspace(unsigned ch) { - return ch == ' ' || (ch - '\t') < 5; +LIBC_INLINE static constexpr bool isalnum(int ch) { + switch (ch) { + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + case 'g': + case 'h': + case 'i': + case 'j': + case 'k': + case 'l': + case 'm': + case 'n': + case 'o': + case 'p': + case 'q': + case 'r': + case 's': + case 't': + case 'u': + case 'v': + case 'w': + case 'x': + case 'y': + case 'z': + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + case 'G': + case 'H': + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'O': + case 'P': + case 'Q': + case 'R': + case 'S': + case 'T': + case 'U': + case 'V': + case 'W': + case 'X': + case 'Y': + case 'Z': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return true; + default: + return false; + } } -LIBC_INLINE static constexpr int tolower(int ch) { - if (isupper(ch)) - return ch + ('a' - 'A'); - return ch; +LIBC_INLINE static constexpr int b36_char_to_int(int ch) { + switch (ch) { + case '0': + return 0; + case '1': + return 1; + case '2': + return 2; + case '3': + return 3; + case '4': + return 4; + case '5': + return 5; + case '6': + return 6; + case '7': + return 7; + case '8': + return 8; + case '9': + return 9; + case 'a': + case 'A': + return 10; + case 'b': + case 'B': + return 11; + case 'c': + case 'C': + return 12; + case 'd': + case 'D': + return 13; + case 'e': + case 'E': + return 14; + case 'f': + case 'F': + return 15; + case 'g': + case 'G': + return 16; + case 'h': + case 'H': + return 17; + case 'i': + case 'I': + return 18; + case 'j': + case 'J': + return 19; + case 'k': + case 'K': + return 20; + case 'l': + case 'L': + return 21; + case 'm': + case 'M': + return 22; + case 'n': + case 'N': + return 23; + case 'o': + case 'O': + return 24; + case 'p': + case 'P': + return 25; + case 'q': + case 'Q': + return 26; + case 'r': + case 'R': + return 27; + case 's': + case 'S': + return 28; + case 't': + case 'T': + return 29; + case 'u': + case 'U': + return 30; + case 'v': + case 'V': + return 31; + case 'w': + case 'W': + return 32; + case 'x': + case 'X': + return 33; + case 'y': + case 'Y': + return 34; + case 'z': + case 'Z': + return 35; + default: + return 0; + } +} + +LIBC_INLINE static constexpr int int_to_b36_char(int num) { + // Can't actually use LIBC_ASSERT here because it depends on integer_to_string + // which depends on this. 
+ + // LIBC_ASSERT(num < 36); + switch (num) { + case 0: + return '0'; + case 1: + return '1'; + case 2: + return '2'; + case 3: + return '3'; + case 4: + return '4'; + case 5: + return '5'; + case 6: + return '6'; + case 7: + return '7'; + case 8: + return '8'; + case 9: + return '9'; + case 10: + return 'a'; + case 11: + return 'b'; + case 12: + return 'c'; + case 13: + return 'd'; + case 14: + return 'e'; + case 15: + return 'f'; + case 16: + return 'g'; + case 17: + return 'h'; + case 18: + return 'i'; + case 19: + return 'j'; + case 20: + return 'k'; + case 21: + return 'l'; + case 22: + return 'm'; + case 23: + return 'n'; + case 24: + return 'o'; + case 25: + return 'p'; + case 26: + return 'q'; + case 27: + return 'r'; + case 28: + return 's'; + case 29: + return 't'; + case 30: + return 'u'; + case 31: + return 'v'; + case 32: + return 'w'; + case 33: + return 'x'; + case 34: + return 'y'; + case 35: + return 'z'; + default: + return '!'; + } +} + +LIBC_INLINE static constexpr bool isspace(int ch) { + switch (ch) { + case ' ': + case '\t': + case '\n': + case '\v': + case '\f': + case '\r': + return true; + default: + return false; + } +} + +// not yet encoding independent. +LIBC_INLINE static constexpr bool isgraph(int ch) { + return 0x20 < ch && ch < 0x7f; } } // namespace internal diff --git a/libc/src/__support/high_precision_decimal.h b/libc/src/__support/high_precision_decimal.h index 20088d6d79791..922dce484aa6b 100644 --- a/libc/src/__support/high_precision_decimal.h +++ b/libc/src/__support/high_precision_decimal.h @@ -178,9 +178,11 @@ class HighPrecisionDecimal { if (digit_index >= this->num_digits) { return new_digits - 1; } - if (this->digits[digit_index] != power_of_five[digit_index] - '0') { + if (this->digits[digit_index] != + internal::b36_char_to_int(power_of_five[digit_index])) { return new_digits - - ((this->digits[digit_index] < power_of_five[digit_index] - '0') + ((this->digits[digit_index] < + internal::b36_char_to_int(power_of_five[digit_index])) ? 1 : 0); } @@ -337,8 +339,8 @@ class HighPrecisionDecimal { } ++total_digits; if (this->num_digits < MAX_NUM_DIGITS) { - this->digits[this->num_digits] = - static_cast(num_string[num_cur] - '0'); + this->digits[this->num_digits] = static_cast( + internal::b36_char_to_int(num_string[num_cur])); ++this->num_digits; } else if (num_string[num_cur] != '0') { this->truncated = true; diff --git a/libc/src/__support/integer_literals.h b/libc/src/__support/integer_literals.h index 4c5c4c4166681..0298ec7d088d6 100644 --- a/libc/src/__support/integer_literals.h +++ b/libc/src/__support/integer_literals.h @@ -13,12 +13,13 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_INTEGER_LITERALS_H #define LLVM_LIBC_SRC___SUPPORT_INTEGER_LITERALS_H -#include "src/__support/CPP/limits.h" // CHAR_BIT +#include "src/__support/CPP/limits.h" // CHAR_BIT +#include "src/__support/ctype_utils.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/config.h" -#include "src/__support/uint128.h" // UInt128 -#include // size_t -#include // uintxx_t +#include "src/__support/uint128.h" // UInt128 +#include // size_t +#include // uintxx_t namespace LIBC_NAMESPACE_DECL { @@ -75,26 +76,13 @@ template struct DigitBuffer { push(*str); } - // Returns the digit for a particular character. - // Returns INVALID_DIGIT if the character is invalid. 
- LIBC_INLINE static constexpr uint8_t get_digit_value(const char c) { - const auto to_lower = [](char c) { return c | 32; }; - const auto is_digit = [](char c) { return c >= '0' && c <= '9'; }; - const auto is_alpha = [](char c) { - return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); - }; - if (is_digit(c)) - return static_cast(c - '0'); - if (base > 10 && is_alpha(c)) - return static_cast(to_lower(c) - 'a' + 10); - return INVALID_DIGIT; - } - // Adds a single character to this buffer. LIBC_INLINE constexpr void push(char c) { if (c == '\'') return; // ' is valid but not taken into account. - const uint8_t value = get_digit_value(c); + const int b36_val = internal::b36_char_to_int(c); + const uint8_t value = static_cast( + b36_val < base && (b36_val != 0 || c == '0') ? b36_val : INVALID_DIGIT); if (value == INVALID_DIGIT || size >= MAX_DIGITS) { // During constant evaluation `__builtin_unreachable` will halt the // compiler as it is not executable. This is preferable over `assert` that diff --git a/libc/src/__support/integer_to_string.h b/libc/src/__support/integer_to_string.h index 40d45a05ceadb..ea620087584cb 100644 --- a/libc/src/__support/integer_to_string.h +++ b/libc/src/__support/integer_to_string.h @@ -69,6 +69,7 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/big_int.h" // make_integral_or_big_int_unsigned_t #include "src/__support/common.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { @@ -214,9 +215,9 @@ template class IntegerToString { using UNSIGNED_T = make_integral_or_big_int_unsigned_t; LIBC_INLINE static char digit_char(uint8_t digit) { - if (digit < 10) - return '0' + static_cast(digit); - return (Fmt::IS_UPPERCASE ? 'A' : 'a') + static_cast(digit - 10); + const int result = internal::int_to_b36_char(digit); + return static_cast(Fmt::IS_UPPERCASE ? internal::toupper(result) + : result); } LIBC_INLINE static void diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index 80ea334d15c03..b4d5646822df3 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -909,7 +909,7 @@ decimal_string_to_float(const char *__restrict src, const char DECIMAL_POINT, cpp::numeric_limits::max() / BASE; while (true) { if (isdigit(src[index])) { - uint32_t digit = src[index] - '0'; + uint32_t digit = b36_char_to_int(src[index]); seen_digit = true; if (mantissa < bitstype_max_div_by_base) { diff --git a/libc/src/__support/str_to_integer.h b/libc/src/__support/str_to_integer.h index 86611f9a6902d..8e569e8a7feb0 100644 --- a/libc/src/__support/str_to_integer.h +++ b/libc/src/__support/str_to_integer.h @@ -42,14 +42,6 @@ first_non_whitespace(const char *__restrict src, return src + src_cur; } -LIBC_INLINE int b36_char_to_int(char input) { - if (isdigit(input)) - return input - '0'; - if (isalpha(input)) - return (input | 32) + 10 - 'a'; - return 0; -} - // checks if the next 3 characters of the string pointer are the start of a // hexadecimal number. Does not advance the string pointer. 
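One subtlety worth spelling out when centralizing on b36_char_to_int: it maps every invalid character to 0, the same value it returns for '0', so callers that must reject bad input pair it with a second check, as DigitBuffer::push does above and is_hex_start does below. A rough standalone sketch of that caller-side convention; `b36` here is an ASCII-only stand-in for the switch-based, encoding-independent helper in ctype_utils.h:

#include <cstdint>
#include <cstdio>

static constexpr int b36(char c) { // stand-in for internal::b36_char_to_int
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'z') return c - 'a' + 10;
  if (c >= 'A' && c <= 'Z') return c - 'A' + 10;
  return 0; // same convention: invalid characters map to 0
}

static constexpr uint8_t INVALID_DIGIT = 0xFF;

static constexpr uint8_t digit_or_invalid(char c, int base) {
  const int v = b36(c);
  // v == 0 is ambiguous, so also require the character to actually be '0'.
  return static_cast<uint8_t>(v < base && (v != 0 || c == '0') ? v : INVALID_DIGIT);
}

int main() {
  std::printf("%d\n", static_cast<int>(digit_or_invalid('f', 16))); // 15
  std::printf("%d\n", static_cast<int>(digit_or_invalid('g', 16))); // 255: out of base
  std::printf("%d\n", static_cast<int>(digit_or_invalid('!', 10))); // 255: b36 is 0 but char is not '0'
}
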
LIBC_INLINE bool @@ -57,7 +49,7 @@ is_hex_start(const char *__restrict src, size_t src_len = cpp::numeric_limits::max()) { if (src_len < 3) return false; - return *src == '0' && (*(src + 1) | 32) == 'x' && isalnum(*(src + 2)) && + return *src == '0' && tolower(*(src + 1)) == 'x' && isalnum(*(src + 2)) && b36_char_to_int(*(src + 2)) < 16; } diff --git a/libc/src/ctype/isxdigit.cpp b/libc/src/ctype/isxdigit.cpp index 6b730c354db08..81f645c6f49fc 100644 --- a/libc/src/ctype/isxdigit.cpp +++ b/libc/src/ctype/isxdigit.cpp @@ -16,7 +16,8 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isxdigit, (int c)) { const unsigned ch = static_cast(c); - return static_cast(internal::isdigit(ch) || (ch | 32) - 'a' < 6); + return static_cast(internal::isalnum(ch) && + internal::b36_char_to_int(ch) < 16); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/isxdigit_l.cpp b/libc/src/ctype/isxdigit_l.cpp index 8a5c7d4d28ab1..eddfd20a2da3b 100644 --- a/libc/src/ctype/isxdigit_l.cpp +++ b/libc/src/ctype/isxdigit_l.cpp @@ -16,7 +16,8 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, isxdigit_l, (int c, locale_t)) { const unsigned ch = static_cast(c); - return static_cast(internal::isdigit(ch) || (ch | 32) - 'a' < 6); + return static_cast(internal::isalnum(ch) && + internal::b36_char_to_int(ch) < 16); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/toupper.cpp b/libc/src/ctype/toupper.cpp index b5a23fc7f588b..1e1e8fc400711 100644 --- a/libc/src/ctype/toupper.cpp +++ b/libc/src/ctype/toupper.cpp @@ -14,10 +14,6 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, toupper, (int c)) { - if (internal::islower(c)) - return c - ('a' - 'A'); - return c; -} +LLVM_LIBC_FUNCTION(int, toupper, (int c)) { return internal::toupper(c); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/ctype/toupper_l.cpp b/libc/src/ctype/toupper_l.cpp index f536ff3623616..a435ca1ab5d41 100644 --- a/libc/src/ctype/toupper_l.cpp +++ b/libc/src/ctype/toupper_l.cpp @@ -15,9 +15,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, toupper_l, (int c, locale_t)) { - if (internal::islower(c)) - return c - ('a' - 'A'); - return c; + return internal::toupper(c); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 76a5e491effa0..390a59d07a28b 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -484,6 +484,7 @@ add_math_entrypoint_object(sincosf) add_math_entrypoint_object(sin) add_math_entrypoint_object(sinf) +add_math_entrypoint_object(sinf16) add_math_entrypoint_object(sinpif) add_math_entrypoint_object(sinpif16) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index a5d17ad023f52..aeb758d4a092d 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -498,6 +498,27 @@ add_entrypoint_object( ${libc_opt_high_flag} ) +add_entrypoint_object( + sinf16 + SRCS + sinf16.cpp + HDRS + ../sinf16.h + DEPENDS + .sincosf16_utils + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.multiply_add + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.types + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( sincos SRCS diff --git a/libc/src/math/generic/sincosf16_utils.h b/libc/src/math/generic/sincosf16_utils.h index 
83511755a56c4..5e5edd4a8c85b 100644 --- a/libc/src/math/generic/sincosf16_utils.h +++ b/libc/src/math/generic/sincosf16_utils.h @@ -11,6 +11,7 @@ #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" @@ -46,10 +47,31 @@ LIBC_INLINE int32_t range_reduction_sincospif16(float x, float &y) { return static_cast(kf); } -LIBC_INLINE void sincospif16_eval(float xf, float &sin_k, float &cos_k, - float &sin_y, float &cosm1_y) { - float y; - int32_t k = range_reduction_sincospif16(xf, y); +// Recall, range reduction: +// k = round(x * 32/pi) +// y = x * 32/pi - k +// +// The constant 0x1.45f306dc9c883p3 is 32/pi rounded to double-precision. +// 32/pi is generated by Sollya with the following commands: +// > display = hexadecimal; +// > round(32/pi, D, RN); +// +// The precision choice of 'double' is to minimize rounding errors +// in this initial scaling step, preserving enough bits so errors accumulated +// while computing the subtraction: y = x * 32/pi - round(x * 32/pi) +// are beyond the least-significant bit of single-precision used during +// further intermediate computation. +LIBC_INLINE int32_t range_reduction_sincosf16(float x, float &y) { + double prod = x * 0x1.45f306dc9c883p3; + double kf = fputil::nearest_integer(prod); + y = static_cast(prod - kf); + + return static_cast(kf); +} + +static LIBC_INLINE void sincosf16_poly_eval(int32_t k, float y, float &sin_k, + float &cos_k, float &sin_y, + float &cosm1_y) { sin_k = SIN_K_PI_OVER_32[k & 63]; cos_k = SIN_K_PI_OVER_32[(k + 16) & 63]; @@ -72,6 +94,22 @@ LIBC_INLINE void sincospif16_eval(float xf, float &sin_k, float &cos_k, 0x1.a6f7a2p-29f); } +LIBC_INLINE void sincosf16_eval(float xf, float &sin_k, float &cos_k, + float &sin_y, float &cosm1_y) { + float y; + int32_t k = range_reduction_sincosf16(xf, y); + + sincosf16_poly_eval(k, y, sin_k, cos_k, sin_y, cosm1_y); +} + +LIBC_INLINE void sincospif16_eval(float xf, float &sin_k, float &cos_k, + float &sin_y, float &cosm1_y) { + float y; + int32_t k = range_reduction_sincospif16(xf, y); + + sincosf16_poly_eval(k, y, sin_k, cos_k, sin_y, cosm1_y); +} + } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_MATH_GENERIC_SINCOSF16_UTILS_H diff --git a/libc/src/math/generic/sinf16.cpp b/libc/src/math/generic/sinf16.cpp new file mode 100644 index 0000000000000..86546348ba739 --- /dev/null +++ b/libc/src/math/generic/sinf16.cpp @@ -0,0 +1,108 @@ +//===-- Half-precision sin(x) function ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
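As a rough numeric illustration of the range reduction documented above, followed by the sine-of-sum reconstruction used in sinf16.cpp; std::nearbyint and std::sin/std::cos stand in for fputil::nearest_integer and the SIN_K_PI_OVER_32 table:

#include <cmath>
#include <cstdio>

int main() {
  const double PI = 0x1.921fb54442d18p1;                 // pi rounded to double
  const double THIRTYTWO_OVER_PI = 0x1.45f306dc9c883p3;  // 32/pi rounded to double
  float x = 1.5f;
  double prod = x * THIRTYTWO_OVER_PI;
  double kf = std::nearbyint(prod); // k = round(x * 32/pi)
  double y = prod - kf;             // |y| <= 0.5
  // sin(x) = sin(k*pi/32) * cos(y*pi/32) + cos(k*pi/32) * sin(y*pi/32)
  double approx = std::sin(kf * PI / 32) * std::cos(y * PI / 32) +
                  std::cos(kf * PI / 32) * std::sin(y * PI / 32);
  std::printf("sin(1.5) = %.9f, reconstruction = %.9f\n", std::sin(1.5), approx);
}
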
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sinf16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "sincosf16_utils.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { + +constexpr size_t N_EXCEPTS = 4; + +constexpr fputil::ExceptValues SINF16_EXCEPTS{{ + // (input, RZ output, RU offset, RD offset, RN offset) + {0x2b45, 0x2b43, 1, 0, 1}, + {0x585c, 0x3ba3, 1, 0, 1}, + {0x5cb0, 0xbbff, 0, 1, 0}, + {0x51f5, 0xb80f, 0, 1, 0}, +}}; + +LLVM_LIBC_FUNCTION(float16, sinf16, (float16 x)) { + using FPBits = fputil::FPBits; + FPBits xbits(x); + + uint16_t x_u = xbits.uintval(); + uint16_t x_abs = x_u & 0x7fff; + float xf = x; + + // Range reduction: + // For |x| > pi/32, we perform range reduction as follows: + // Find k and y such that: + // x = (k + y) * pi/32 + // k is an integer, |y| < 0.5 + // + // This is done by performing: + // k = round(x * 32/pi) + // y = x * 32/pi - k + // + // Once k and y are computed, we then deduce the answer by the sine of sum + // formula: + // sin(x) = sin((k + y) * pi/32) + // = sin(k * pi/32) * cos(y * pi/32) + + // sin(y * pi/32) * cos(k * pi/32) + + // Handle exceptional values + if (LIBC_UNLIKELY(x_abs == 0x585c || x_abs == 0x5cb0 || x_abs == 0x51f5 || + x_abs == 0x2b45)) { + bool x_sign = x_u >> 15; + if (auto r = SINF16_EXCEPTS.lookup_odd(x_abs, x_sign); + LIBC_UNLIKELY(r.has_value())) + return r.value(); + } + + int rounding = fputil::quick_get_round(); + + // Exhaustive tests show that for |x| <= 0x1.f4p-11, 1ULP rounding errors + // occur. To fix this, the following apply: + if (LIBC_UNLIKELY(x_abs <= 0x13d0)) { + // sin(+/-0) = +/-0 + if (LIBC_UNLIKELY(x_abs == 0U)) + return x; + + // When x > 0, and rounding upward, sin(x) == x. + // When x < 0, and rounding downward, sin(x) == x. 
+ if ((rounding == FE_UPWARD && xbits.is_pos()) || + (rounding == FE_DOWNWARD && xbits.is_neg())) + return x; + + // When x < 0, and rounding upward, sin(x) == (x - 1ULP) + if (rounding == FE_UPWARD && xbits.is_neg()) { + x_u--; + return FPBits(x_u).get_val(); + } + } + + if (xbits.is_inf_or_nan()) { + if (xbits.is_inf()) { + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_INVALID); + } + + return x + FPBits::quiet_nan().get_val(); + } + + float sin_k, cos_k, sin_y, cosm1_y; + sincosf16_eval(xf, sin_k, cos_k, sin_y, cosm1_y); + + if (LIBC_UNLIKELY(sin_y == 0 && sin_k == 0)) + return FPBits::zero(xbits.sign()).get_val(); + + // Since cosm1_y = cos_y - 1, therefore: + // sin(x) = cos_k * sin_y + sin_k + (cosm1_y * sin_k) + return fputil::cast(fputil::multiply_add( + sin_y, cos_k, fputil::multiply_add(cosm1_y, sin_k, sin_k))); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/tanpif16.cpp b/libc/src/math/generic/tanpif16.cpp index ab3c9cb2122ba..67635536ee319 100644 --- a/libc/src/math/generic/tanpif16.cpp +++ b/libc/src/math/generic/tanpif16.cpp @@ -21,7 +21,7 @@ namespace LIBC_NAMESPACE_DECL { constexpr size_t N_EXCEPTS = 21; -constexpr fputil::ExceptValues TANF16_EXCEPTS{{ +constexpr fputil::ExceptValues TANPIF16_EXCEPTS{{ // (input, RZ output, RU offset, RD offset, RN offset) {0x07f2, 0x0e3d, 1, 0, 0}, {0x086a, 0x0eee, 1, 0, 1}, {0x08db, 0x0fa0, 1, 0, 0}, {0x094c, 0x1029, 1, 0, 0}, @@ -49,7 +49,7 @@ LLVM_LIBC_FUNCTION(float16, tanpif16, (float16 x)) { return x; bool x_sign = x_u >> 15; - if (auto r = TANF16_EXCEPTS.lookup_odd(x_abs, x_sign); + if (auto r = TANPIF16_EXCEPTS.lookup_odd(x_abs, x_sign); LIBC_UNLIKELY(r.has_value())) return r.value(); } diff --git a/libc/src/math/sinf16.h b/libc/src/math/sinf16.h new file mode 100644 index 0000000000000..23f1aa99b6233 --- /dev/null +++ b/libc/src/math/sinf16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for sinf16 ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SINF16_H +#define LLVM_LIBC_SRC_MATH_SINF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 sinf16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_SINF16_H diff --git a/libc/src/stdio/printf_core/fixed_converter.h b/libc/src/stdio/printf_core/fixed_converter.h index c8812d77b62e3..ba0a62d9fcb87 100644 --- a/libc/src/stdio/printf_core/fixed_converter.h +++ b/libc/src/stdio/printf_core/fixed_converter.h @@ -11,6 +11,7 @@ #include "include/llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/string_view.h" +#include "src/__support/ctype_utils.h" #include "src/__support/fixed_point/fx_bits.h" #include "src/__support/fixed_point/fx_rep.h" #include "src/__support/integer_to_string.h" @@ -68,10 +69,6 @@ LIBC_INLINE int convert_fixed(Writer *writer, const FormatSection &to_conv) { using LARep = fixed_point::FXRep; using StorageType = LARep::StorageType; - // All of the letters will be defined relative to variable a, which will be - // the appropriate case based on the name of the conversion. This converts any - // conversion name into the letter 'a' with the appropriate case.
- const char a = (to_conv.conv_name & 32) | 'A'; FormatFlags flags = to_conv.flags; bool is_negative; @@ -179,9 +176,9 @@ LIBC_INLINE int convert_fixed(Writer *writer, const FormatSection &to_conv) { // unspecified. RoundDirection round; char first_digit_after = fraction_digits[precision]; - if (first_digit_after > '5') { + if (internal::b36_char_to_int(first_digit_after) > 5) { round = RoundDirection::Up; - } else if (first_digit_after < '5') { + } else if (internal::b36_char_to_int(first_digit_after) < 5) { round = RoundDirection::Down; } else { // first_digit_after == '5' @@ -204,7 +201,8 @@ LIBC_INLINE int convert_fixed(Writer *writer, const FormatSection &to_conv) { keep_rounding = false; char cur_digit = fraction_digits[digit_to_round]; // if the digit should not be rounded up - if (round == RoundDirection::Even && ((cur_digit - '0') % 2) == 0) { + if (round == RoundDirection::Even && + (internal::b36_char_to_int(cur_digit) % 2) == 0) { // break out of the loop break; } @@ -246,7 +244,7 @@ LIBC_INLINE int convert_fixed(Writer *writer, const FormatSection &to_conv) { char sign_char = 0; // Check if the conv name is uppercase - if (a == 'A') { + if (internal::isupper(to_conv.conv_name)) { // These flags are only for signed conversions, so this removes them if the // conversion is unsigned. flags = FormatFlags(flags & diff --git a/libc/src/stdio/printf_core/float_dec_converter.h b/libc/src/stdio/printf_core/float_dec_converter.h index e39ba6ecea8d4..d93457fcafd7f 100644 --- a/libc/src/stdio/printf_core/float_dec_converter.h +++ b/libc/src/stdio/printf_core/float_dec_converter.h @@ -13,6 +13,7 @@ #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/big_int.h" // is_big_int_v +#include "src/__support/ctype_utils.h" #include "src/__support/float_to_string.h" #include "src/__support/integer_to_string.h" #include "src/__support/libc_assert.h" @@ -587,8 +588,6 @@ LIBC_INLINE int convert_float_dec_exp_typed(Writer *writer, int exponent = float_bits.get_explicit_exponent(); StorageType mantissa = float_bits.get_explicit_mantissa(); - const char a = (to_conv.conv_name & 32) | 'A'; - char sign_char = 0; if (float_bits.is_neg()) @@ -734,7 +733,8 @@ LIBC_INLINE int convert_float_dec_exp_typed(Writer *writer, round = get_round_direction(last_digit, truncated, float_bits.sign()); RET_IF_RESULT_NEGATIVE(float_writer.write_last_block( - digits, maximum, round, final_exponent, a + 'E' - 'A')); + digits, maximum, round, final_exponent, + internal::islower(to_conv.conv_name) ? 
'e' : 'E')); RET_IF_RESULT_NEGATIVE(float_writer.right_pad()); return WRITE_OK; diff --git a/libc/src/stdio/printf_core/float_hex_converter.h b/libc/src/stdio/printf_core/float_hex_converter.h index 0b3ff3dd1cbfd..b264b5cf20728 100644 --- a/libc/src/stdio/printf_core/float_hex_converter.h +++ b/libc/src/stdio/printf_core/float_hex_converter.h @@ -12,6 +12,7 @@ #include "src/__support/CPP/string_view.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/converter_utils.h" #include "src/stdio/printf_core/core_structs.h" @@ -28,10 +29,6 @@ LIBC_INLINE int convert_float_hex_exp(Writer *writer, const FormatSection &to_conv) { using LDBits = fputil::FPBits; using StorageType = LDBits::StorageType; - // All of the letters will be defined relative to variable a, which will be - // the appropriate case based on the name of the conversion. This converts any - // conversion name into the letter 'a' with the appropriate case. - const char a = (to_conv.conv_name & 32) | 'A'; bool is_negative; int exponent; @@ -138,9 +135,10 @@ LIBC_INLINE int convert_float_hex_exp(Writer *writer, size_t mant_cur = mant_len; size_t first_non_zero = 1; for (; mant_cur > 0; --mant_cur, mantissa >>= 4) { - char mant_mod_16 = static_cast(mantissa) & 15; - char new_digit = static_cast( - (mant_mod_16 > 9) ? (mant_mod_16 - 10 + a) : (mant_mod_16 + '0')); + char mant_mod_16 = static_cast(mantissa % 16); + char new_digit = static_cast(internal::int_to_b36_char(mant_mod_16)); + if (internal::isupper(to_conv.conv_name)) + new_digit = static_cast(internal::toupper(new_digit)); mant_buffer[mant_cur - 1] = new_digit; if (new_digit != '0' && first_non_zero < mant_cur) first_non_zero = mant_cur; @@ -168,7 +166,8 @@ LIBC_INLINE int convert_float_hex_exp(Writer *writer, size_t exp_cur = EXP_LEN; for (; exponent > 0; --exp_cur, exponent /= 10) { - exp_buffer[exp_cur - 1] = static_cast((exponent % 10) + '0'); + exp_buffer[exp_cur - 1] = + static_cast(internal::int_to_b36_char(exponent % 10)); } if (exp_cur == EXP_LEN) { // if nothing else was written, write a 0. exp_buffer[EXP_LEN - 1] = '0'; @@ -187,7 +186,7 @@ LIBC_INLINE int convert_float_hex_exp(Writer *writer, constexpr size_t PREFIX_LEN = 2; char prefix[PREFIX_LEN]; prefix[0] = '0'; - prefix[1] = a + ('x' - 'a'); + prefix[1] = internal::islower(to_conv.conv_name) ? 'x' : 'X'; const cpp::string_view prefix_str(prefix, PREFIX_LEN); // If the precision is greater than the actual result, pad with 0s @@ -200,7 +199,7 @@ LIBC_INLINE int convert_float_hex_exp(Writer *writer, constexpr cpp::string_view HEXADECIMAL_POINT("."); // This is for the letter 'p' before the exponent. - const char exp_separator = a + ('p' - 'a'); + const char exp_separator = internal::islower(to_conv.conv_name) ? 'p' : 'P'; constexpr int EXP_SEPARATOR_LEN = 1; padding = static_cast(to_conv.min_width - (sign_char > 0 ? 
1 : 0) - diff --git a/libc/src/stdio/printf_core/float_inf_nan_converter.h b/libc/src/stdio/printf_core/float_inf_nan_converter.h index a7da682b835be..3e41612e21c9f 100644 --- a/libc/src/stdio/printf_core/float_inf_nan_converter.h +++ b/libc/src/stdio/printf_core/float_inf_nan_converter.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FLOAT_INF_NAN_CONVERTER_H #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/converter_utils.h" #include "src/stdio/printf_core/core_structs.h" @@ -26,8 +27,6 @@ using StorageType = fputil::FPBits::StorageType; LIBC_INLINE int convert_inf_nan(Writer *writer, const FormatSection &to_conv) { // All of the letters will be defined relative to variable a, which will be // the appropriate case based on the case of the conversion. - const char a = (to_conv.conv_name & 32) | 'A'; - bool is_negative; StorageType mantissa; if (to_conv.length_modifier == LengthModifier::L) { @@ -66,9 +65,11 @@ LIBC_INLINE int convert_inf_nan(Writer *writer, const FormatSection &to_conv) { if (sign_char) RET_IF_RESULT_NEGATIVE(writer->write(sign_char)); if (mantissa == 0) { // inf - RET_IF_RESULT_NEGATIVE(writer->write(a == 'a' ? "inf" : "INF")); + RET_IF_RESULT_NEGATIVE( + writer->write(internal::islower(to_conv.conv_name) ? "inf" : "INF")); } else { // nan - RET_IF_RESULT_NEGATIVE(writer->write(a == 'a' ? "nan" : "NAN")); + RET_IF_RESULT_NEGATIVE( + writer->write(internal::islower(to_conv.conv_name) ? "nan" : "NAN")); } if (padding > 0 && ((to_conv.flags & FormatFlags::LEFT_JUSTIFIED) == diff --git a/libc/src/stdio/printf_core/int_converter.h b/libc/src/stdio/printf_core/int_converter.h index f345e86b97a69..d0af229f89be5 100644 --- a/libc/src/stdio/printf_core/int_converter.h +++ b/libc/src/stdio/printf_core/int_converter.h @@ -11,6 +11,7 @@ #include "src/__support/CPP/span.h" #include "src/__support/CPP/string_view.h" +#include "src/__support/ctype_utils.h" #include "src/__support/integer_to_string.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/converter_utils.h" @@ -23,11 +24,6 @@ namespace LIBC_NAMESPACE_DECL { namespace printf_core { -// These functions only work on characters that are already known to be in the -// alphabet. Their behavior is undefined otherwise. -LIBC_INLINE constexpr char to_lower(char a) { return a | 32; } -LIBC_INLINE constexpr bool is_lower(char a) { return (a & 32) > 0; } - namespace details { using HexFmt = IntegerToString; @@ -49,14 +45,14 @@ LIBC_INLINE constexpr size_t num_buf_size() { LIBC_INLINE cpp::optional num_to_strview(uintmax_t num, cpp::span bufref, char conv_name) { - if (to_lower(conv_name) == 'x') { - if (is_lower(conv_name)) + if (internal::tolower(conv_name) == 'x') { + if (internal::islower(conv_name)) return HexFmt::format_to(bufref, num); else return HexFmtUppercase::format_to(bufref, num); } else if (conv_name == 'o') { return OctFmt::format_to(bufref, num); - } else if (to_lower(conv_name) == 'b') { + } else if (internal::tolower(conv_name) == 'b') { return BinFmt::format_to(bufref, num); } else { return DecFmt::format_to(bufref, num); @@ -72,7 +68,6 @@ LIBC_INLINE int convert_int(Writer *writer, const FormatSection &to_conv) { uintmax_t num = static_cast(to_conv.conv_val_raw); bool is_negative = false; FormatFlags flags = to_conv.flags; - const char a = is_lower(to_conv.conv_name) ? 'a' : 'A'; // If the conversion is signed, then handle negative values. 
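The int_converter changes that follow replace the old `(conv_name & 32) | 'A'` case trick with explicit internal::islower/internal::tolower tests when choosing the 0x/0X and 0b/0B prefixes. A small sketch of the idea, with std::islower as a stand-in for the internal helper:

#include <cctype>
#include <cstdio>

// conv_name is 'x' or 'X'; its case drives both the prefix and the digits.
static void print_hex_alternate_form(unsigned value, char conv_name) {
  const bool lower = std::islower(static_cast<unsigned char>(conv_name)) != 0;
  std::printf("0%c", lower ? 'x' : 'X');          // alternate-form prefix
  std::printf(lower ? "%x\n" : "%X\n", value);    // matching digit case
}

int main() {
  print_hex_alternate_form(31, 'x'); // 0x1f
  print_hex_alternate_form(31, 'X'); // 0X1F
}
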
if (to_conv.conv_name == 'd' || to_conv.conv_name == 'i') { @@ -116,16 +111,16 @@ LIBC_INLINE int convert_int(Writer *writer, const FormatSection &to_conv) { // conversions. Since hexadecimal is unsigned these will never conflict. size_t prefix_len; char prefix[2]; - if ((to_lower(to_conv.conv_name) == 'x') && + if ((internal::tolower(to_conv.conv_name) == 'x') && ((flags & FormatFlags::ALTERNATE_FORM) != 0) && num != 0) { prefix_len = 2; prefix[0] = '0'; - prefix[1] = a + ('x' - 'a'); - } else if ((to_lower(to_conv.conv_name) == 'b') && + prefix[1] = internal::islower(to_conv.conv_name) ? 'x' : 'X'; + } else if ((internal::tolower(to_conv.conv_name) == 'b') && ((flags & FormatFlags::ALTERNATE_FORM) != 0) && num != 0) { prefix_len = 2; prefix[0] = '0'; - prefix[1] = a + ('b' - 'a'); + prefix[1] = internal::islower(to_conv.conv_name) ? 'b' : 'B'; } else { prefix_len = (sign_char == 0 ? 0 : 1); prefix[0] = sign_char; diff --git a/libc/src/stdio/scanf_core/converter_utils.h b/libc/src/stdio/scanf_core/converter_utils.h index 61954556b838a..6f4d16cffb19c 100644 --- a/libc/src/stdio/scanf_core/converter_utils.h +++ b/libc/src/stdio/scanf_core/converter_utils.h @@ -19,16 +19,6 @@ namespace LIBC_NAMESPACE_DECL { namespace scanf_core { -LIBC_INLINE constexpr char to_lower(char a) { return a | 32; } - -LIBC_INLINE constexpr int b36_char_to_int(char input) { - if (internal::isdigit(input)) - return input - '0'; - if (internal::isalpha(input)) - return to_lower(input) + 10 - 'a'; - return 0; -} - LIBC_INLINE void write_int_with_length(uintmax_t output_val, const FormatSection &to_conv) { if ((to_conv.flags & NO_WRITE) != 0) { diff --git a/libc/src/stdio/scanf_core/float_converter.cpp b/libc/src/stdio/scanf_core/float_converter.cpp index b2d60a249a5a7..9c714d0727214 100644 --- a/libc/src/stdio/scanf_core/float_converter.cpp +++ b/libc/src/stdio/scanf_core/float_converter.cpp @@ -55,11 +55,12 @@ int convert_float(Reader *reader, const FormatSection &to_conv) { // Handle inf - if (to_lower(cur_char) == inf_string[0]) { + if (internal::tolower(cur_char) == inf_string[0]) { size_t inf_index = 0; - for (; inf_index < sizeof(inf_string) && out_str.length() < max_width && - to_lower(cur_char) == inf_string[inf_index]; + for (; + inf_index < (sizeof(inf_string) - 1) && out_str.length() < max_width && + internal::tolower(cur_char) == inf_string[inf_index]; ++inf_index) { if (!out_str.append(cur_char)) { return ALLOCATION_FAILURE; @@ -78,11 +79,12 @@ int convert_float(Reader *reader, const FormatSection &to_conv) { static const char nan_string[] = "nan"; // Handle nan - if (to_lower(cur_char) == nan_string[0]) { + if (internal::tolower(cur_char) == nan_string[0]) { size_t nan_index = 0; - for (; nan_index < sizeof(nan_string) && out_str.length() < max_width && - to_lower(cur_char) == nan_string[nan_index]; + for (; + nan_index < (sizeof(nan_string) - 1) && out_str.length() < max_width && + internal::tolower(cur_char) == nan_string[nan_index]; ++nan_index) { if (!out_str.append(cur_char)) { return ALLOCATION_FAILURE; @@ -117,7 +119,7 @@ int convert_float(Reader *reader, const FormatSection &to_conv) { } // If that next character is an 'x' then this is a hexadecimal number. - if (to_lower(cur_char) == 'x') { + if (internal::tolower(cur_char) == 'x') { base = 16; if (!out_str.append(cur_char)) { @@ -163,7 +165,7 @@ int convert_float(Reader *reader, const FormatSection &to_conv) { // Handle the exponent, which has an exponent mark, an optional sign, and // decimal digits. 
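The sizeof adjustments in the scanf float converter above fix an off-by-one: for a string literal array, sizeof counts the trailing NUL, so a matching loop bounded by sizeof compares one character too many. A two-line demonstration:

#include <cstdio>

int main() {
  static const char inf_string[] = "inf";
  // sizeof is 4 (includes '\0'); the match loop must stop at 3.
  std::printf("sizeof = %zu, length = %zu\n", sizeof(inf_string),
              sizeof(inf_string) - 1);
}
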
- if (to_lower(cur_char) == exponent_mark) { + if (internal::tolower(cur_char) == exponent_mark) { if (!out_str.append(cur_char)) { return ALLOCATION_FAILURE; } diff --git a/libc/src/stdio/scanf_core/int_converter.cpp b/libc/src/stdio/scanf_core/int_converter.cpp index ecdac52e84bbd..fce817245c010 100644 --- a/libc/src/stdio/scanf_core/int_converter.cpp +++ b/libc/src/stdio/scanf_core/int_converter.cpp @@ -80,7 +80,8 @@ int convert_int(Reader *reader, const FormatSection &to_conv) { is_signed = true; } else if (to_conv.conv_name == 'o') { base = 8; - } else if (to_lower(to_conv.conv_name) == 'x' || to_conv.conv_name == 'p') { + } else if (internal::tolower(to_conv.conv_name) == 'x' || + to_conv.conv_name == 'p') { base = 16; } else if (to_conv.conv_name == 'd') { base = 10; @@ -122,7 +123,7 @@ int convert_int(Reader *reader, const FormatSection &to_conv) { return READ_OK; } - if (to_lower(cur_char) == 'x') { + if (internal::tolower(cur_char) == 'x') { // This is a valid hex prefix. is_number = false; @@ -175,17 +176,18 @@ int convert_int(Reader *reader, const FormatSection &to_conv) { const uintmax_t max_div_by_base = MAX / base; - if (internal::isalnum(cur_char) && b36_char_to_int(cur_char) < base) { + if (internal::isalnum(cur_char) && + internal::b36_char_to_int(cur_char) < base) { is_number = true; } bool has_overflow = false; size_t i = 0; for (; i < max_width && internal::isalnum(cur_char) && - b36_char_to_int(cur_char) < base; + internal::b36_char_to_int(cur_char) < base; ++i, cur_char = reader->getc()) { - uintmax_t cur_digit = b36_char_to_int(cur_char); + uintmax_t cur_digit = internal::b36_char_to_int(cur_char); if (result == MAX) { has_overflow = true; diff --git a/libc/src/stdio/scanf_core/ptr_converter.cpp b/libc/src/stdio/scanf_core/ptr_converter.cpp index 1a42a389d74b4..37f002d3da698 100644 --- a/libc/src/stdio/scanf_core/ptr_converter.cpp +++ b/libc/src/stdio/scanf_core/ptr_converter.cpp @@ -8,6 +8,7 @@ #include "src/stdio/scanf_core/ptr_converter.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" #include "src/stdio/scanf_core/converter_utils.h" #include "src/stdio/scanf_core/core_structs.h" @@ -24,7 +25,8 @@ int convert_pointer(Reader *reader, const FormatSection &to_conv) { // Check if it's exactly the nullptr string, if so then it's a nullptr. char cur_char = reader->getc(); size_t i = 0; - for (; i < sizeof(nullptr_string) && to_lower(cur_char) == nullptr_string[i]; + for (; i < (sizeof(nullptr_string) - 1) && + internal::tolower(cur_char) == nullptr_string[i]; ++i) { cur_char = reader->getc(); } diff --git a/libc/test/UnitTest/MemoryMatcher.cpp b/libc/test/UnitTest/MemoryMatcher.cpp index 244f25572c378..3cd5174fd7f75 100644 --- a/libc/test/UnitTest/MemoryMatcher.cpp +++ b/libc/test/UnitTest/MemoryMatcher.cpp @@ -8,6 +8,7 @@ #include "MemoryMatcher.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" #include "test/UnitTest/Test.h" @@ -40,7 +41,8 @@ bool MemoryMatcher::match(MemoryView actualValue) { static void display(char C) { const auto print = [](unsigned char I) { - tlog << static_cast(I < 10 ? 
'0' + I : 'A' + I - 10); + tlog << static_cast(LIBC_NAMESPACE::internal::toupper( + LIBC_NAMESPACE::internal::int_to_b36_char(I))); }; print(static_cast(C) / 16); print(static_cast(C) & 15); diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index e121555bd60a9..8ac8f91e98d4c 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -88,81 +88,3 @@ add_subdirectory(locale) if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(pthread) endif() - -if(LLVM_RUNTIMES_BUILD OR LIBC_HDRGEN_EXE) - # The public API test below uses tablegen to generate the test - # source file. Since tablegen is not available during a runtimes - # build, we will skip the test. - # If a different libc-hdrgen binary is being used, then also we - # skip the api-test as we cannot generate the test source file. - return() -endif() - -set(public_test ${CMAKE_CURRENT_BINARY_DIR}/public_api_test.cpp) - -set(entrypoints_name_list "") -foreach(entry IN LISTS TARGET_LLVMLIBC_ENTRYPOINTS) - get_target_property(entry_name ${entry} "ENTRYPOINT_NAME") - list(APPEND entrypoints_name_list ${entry_name}) -endforeach() - -# TODO: Remove these when they are added to the TableGen. -list(REMOVE_ITEM entrypoints_name_list "__assert_fail" "__errno_location") -list(TRANSFORM entrypoints_name_list PREPEND "-e=") - -file(GLOB spec_files ${LIBC_SOURCE_DIR}/spec/*.td) - -# Generate api test souce code. -add_custom_command( - OUTPUT ${public_test} - COMMAND $ -o ${public_test} - ${entrypoints_name_list} - -I ${LIBC_SOURCE_DIR} - ${LIBC_SOURCE_DIR}/config/${LIBC_TARGET_OS}/api.td - - DEPENDS ${LIBC_SOURCE_DIR}/config/${LIBC_TARGET_OS}/api.td ${spec_files} - libc-prototype-testgen ${TARGET_PUBLIC_HEADERS} - ${LIBC_TARGET} -) - -add_custom_target(libc-api-test) -add_dependencies(check-libc libc-api-test) - -set( - allocator_entrypoints - libc.src.stdlib.malloc - libc.src.stdlib.calloc - libc.src.stdlib.realloc - libc.src.stdlib.aligned_alloc - libc.src.stdlib.free -) -set(api-test-entrypoints ${TARGET_LLVMLIBC_ENTRYPOINTS}) -list(REMOVE_ITEM api-test-entrypoints ${allocator_entrypoints}) -add_integration_test( - api-test - SUITE - libc-api-test - SRCS - ${public_test} - DEPENDS - ${api-test-entrypoints} -) - -if(COMPILER_RESOURCE_DIR AND LLVM_LIBC_ENABLE_LINTING) - add_custom_target( - libc-api-test-tidy - VERBATIM - COMMAND ${LLVM_LIBC_CLANG_TIDY} --system-headers - --checks=-*,llvmlibc-restrict-system-libc-headers - "--extra-arg=-resource-dir=${COMPILER_RESOURCE_DIR}" - --header-filter=.* - --warnings-as-errors=llvmlibc-* - "-config={CheckOptions: [{key: llvmlibc-restrict-system-libc-headers.Includes, value: '-*, linux/*, asm/*.h, asm-generic/*.h'}]}" - --quiet - -p ${PROJECT_BINARY_DIR} - ${public_test} - DEPENDS - clang-tidy ${public_test} - ) - add_dependencies(libc-api-test libc-api-test-tidy) -endif() diff --git a/libc/test/src/__support/CPP/stringview_test.cpp b/libc/test/src/__support/CPP/stringview_test.cpp index 6b68f2a1c47a9..c9348243745a7 100644 --- a/libc/test/src/__support/CPP/stringview_test.cpp +++ b/libc/test/src/__support/CPP/stringview_test.cpp @@ -109,8 +109,6 @@ TEST(LlvmLibcStringViewTest, Observer) { ASSERT_EQ(ABC.back(), 'c'); } -bool isDigit(char c) { return c >= '0' && c <= '9'; } - TEST(LlvmLibcStringViewTest, FindFirstOf) { string_view Tmp("abca"); ASSERT_TRUE(Tmp.find_first_of('a') == 0); @@ -236,6 +234,9 @@ TEST(LlvmLibcStringViewTest, FindFirstNotOf) { TEST(LlvmLibcStringViewTest, Contains) { string_view Empty; + static_assert( + 'a' < 'z', + "This test only supports 
character encodings where 'a' is below 'z'"); for (char c = 'a'; c < 'z'; ++c) EXPECT_FALSE(Empty.contains(c)); diff --git a/libc/test/src/ctype/isalnum_test.cpp b/libc/test/src/ctype/isalnum_test.cpp index b71d36111d725..18ddd2b14b8c8 100644 --- a/libc/test/src/ctype/isalnum_test.cpp +++ b/libc/test/src/ctype/isalnum_test.cpp @@ -6,18 +6,45 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/CPP/span.h" #include "src/ctype/isalnum.h" #include "test/UnitTest/Test.h" +TEST(LlvmLibcIsAlNum, SimpleTest) { + EXPECT_NE(LIBC_NAMESPACE::isalnum('a'), 0); + EXPECT_NE(LIBC_NAMESPACE::isalnum('B'), 0); + EXPECT_NE(LIBC_NAMESPACE::isalnum('3'), 0); + + EXPECT_EQ(LIBC_NAMESPACE::isalnum(' '), 0); + EXPECT_EQ(LIBC_NAMESPACE::isalnum('?'), 0); + EXPECT_EQ(LIBC_NAMESPACE::isalnum('\0'), 0); + EXPECT_EQ(LIBC_NAMESPACE::isalnum(-1), 0); +} + +// TODO: Merge the ctype tests using this framework. +constexpr char ALNUM_ARRAY[] = { + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', + 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', + 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', +}; + +bool in_span(int ch, LIBC_NAMESPACE::cpp::span arr) { + for (size_t i = 0; i < arr.size(); ++i) + if (static_cast(arr[i]) == ch) + return true; + return false; +} + TEST(LlvmLibcIsAlNum, DefaultLocale) { // Loops through all characters, verifying that numbers and letters // return non-zero integer and everything else returns a zero. - for (int c = -255; c < 255; ++c) { - if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || - ('0' <= c && c <= '9')) - EXPECT_NE(LIBC_NAMESPACE::isalnum(c), 0); + for (int ch = -255; ch < 255; ++ch) { + if (in_span(ch, ALNUM_ARRAY)) + EXPECT_NE(LIBC_NAMESPACE::isalnum(ch), 0); else - EXPECT_EQ(LIBC_NAMESPACE::isalnum(c), 0); + EXPECT_EQ(LIBC_NAMESPACE::isalnum(ch), 0); } } diff --git a/libc/test/src/ctype/isalpha_test.cpp b/libc/test/src/ctype/isalpha_test.cpp index 10cdb962ee2ee..e54b580dbe264 100644 --- a/libc/test/src/ctype/isalpha_test.cpp +++ b/libc/test/src/ctype/isalpha_test.cpp @@ -6,15 +6,43 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/CPP/span.h" #include "src/ctype/isalpha.h" #include "test/UnitTest/Test.h" +TEST(LlvmLibcIsAlpha, SimpleTest) { + EXPECT_NE(LIBC_NAMESPACE::isalpha('a'), 0); + EXPECT_NE(LIBC_NAMESPACE::isalpha('B'), 0); + + EXPECT_EQ(LIBC_NAMESPACE::isalpha('3'), 0); + EXPECT_EQ(LIBC_NAMESPACE::isalpha(' '), 0); + EXPECT_EQ(LIBC_NAMESPACE::isalpha('?'), 0); + EXPECT_EQ(LIBC_NAMESPACE::isalpha('\0'), 0); + EXPECT_EQ(LIBC_NAMESPACE::isalpha(-1), 0); +} + +// TODO: Merge the ctype tests using this framework. +constexpr char ALPHA_ARRAY[] = { + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', + 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', + 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', +}; + +bool in_span(int ch, LIBC_NAMESPACE::cpp::span arr) { + for (size_t i = 0; i < arr.size(); ++i) + if (static_cast(arr[i]) == ch) + return true; + return false; +} + TEST(LlvmLibcIsAlpha, DefaultLocale) { // Loops through all characters, verifying that letters return a // non-zero integer and everything else returns zero. 
+  // TODO: Make this test independent of the character encoding.
   for (int ch = -255; ch < 255; ++ch) {
-    if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'))
+    if (in_span(ch, ALPHA_ARRAY))
       EXPECT_NE(LIBC_NAMESPACE::isalpha(ch), 0);
     else
       EXPECT_EQ(LIBC_NAMESPACE::isalpha(ch), 0);
diff --git a/libc/test/src/ctype/isdigit_test.cpp b/libc/test/src/ctype/isdigit_test.cpp
index a9f84db3ef7e8..adea55e59c74d 100644
--- a/libc/test/src/ctype/isdigit_test.cpp
+++ b/libc/test/src/ctype/isdigit_test.cpp
@@ -6,15 +6,39 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/CPP/span.h"
 #include "src/ctype/isdigit.h"
 #include "test/UnitTest/Test.h"
 
+TEST(LlvmLibcIsDigit, SimpleTest) {
+  EXPECT_NE(LIBC_NAMESPACE::isdigit('3'), 0);
+
+  EXPECT_EQ(LIBC_NAMESPACE::isdigit('a'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isdigit('B'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isdigit(' '), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isdigit('?'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isdigit('\0'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isdigit(-1), 0);
+}
+
+// TODO: Merge the ctype tests using this framework.
+constexpr char DIGIT_ARRAY[] = {
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+};
+
+bool in_span(int ch, LIBC_NAMESPACE::cpp::span<const char> arr) {
+  for (size_t i = 0; i < arr.size(); ++i)
+    if (static_cast<int>(arr[i]) == ch)
+      return true;
+  return false;
+}
+
 TEST(LlvmLibcIsDigit, DefaultLocale) {
-  // Loops through all characters, verifying that numbers return a
-  // non-zero integer and everything else returns zero.
+  // Loops through all characters, verifying that digits return a
+  // non-zero integer and everything else returns zero.
   for (int ch = -255; ch < 255; ++ch) {
-    if ('0' <= ch && ch <= '9')
+    if (in_span(ch, DIGIT_ARRAY))
       EXPECT_NE(LIBC_NAMESPACE::isdigit(ch), 0);
     else
       EXPECT_EQ(LIBC_NAMESPACE::isdigit(ch), 0);
diff --git a/libc/test/src/ctype/islower_test.cpp b/libc/test/src/ctype/islower_test.cpp
index ba7caf65b6fd3..f9414bd8cbd09 100644
--- a/libc/test/src/ctype/islower_test.cpp
+++ b/libc/test/src/ctype/islower_test.cpp
@@ -6,14 +6,40 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/CPP/span.h"
 #include "src/ctype/islower.h"
+
 #include "test/UnitTest/Test.h"
 
+TEST(LlvmLibcIsLower, SimpleTest) {
+  EXPECT_NE(LIBC_NAMESPACE::islower('a'), 0);
+
+  EXPECT_EQ(LIBC_NAMESPACE::islower('B'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::islower('3'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::islower(' '), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::islower('?'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::islower('\0'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::islower(-1), 0);
+}
+
+// TODO: Merge the ctype tests using this framework.
+constexpr char LOWER_ARRAY[] = {
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
+    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+};
+
+bool in_span(int ch, LIBC_NAMESPACE::cpp::span<const char> arr) {
+  for (size_t i = 0; i < arr.size(); ++i)
+    if (static_cast<int>(arr[i]) == ch)
+      return true;
+  return false;
+}
+
 TEST(LlvmLibcIsLower, DefaultLocale) {
-  // Loops through all characters, verifying that lowercase letters
-  // return a non-zero integer and everything else returns zero.
+  // Loops through all characters, verifying that lowercase letters return a
+  // non-zero integer and everything else returns zero.
   for (int ch = -255; ch < 255; ++ch) {
-    if ('a' <= ch && ch <= 'z')
+    if (in_span(ch, LOWER_ARRAY))
       EXPECT_NE(LIBC_NAMESPACE::islower(ch), 0);
     else
       EXPECT_EQ(LIBC_NAMESPACE::islower(ch), 0);
diff --git a/libc/test/src/ctype/isupper_test.cpp b/libc/test/src/ctype/isupper_test.cpp
index 05b2fd069ef06..94def1a9dcccd 100644
--- a/libc/test/src/ctype/isupper_test.cpp
+++ b/libc/test/src/ctype/isupper_test.cpp
@@ -6,14 +6,40 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/CPP/span.h"
 #include "src/ctype/isupper.h"
+
 #include "test/UnitTest/Test.h"
 
+TEST(LlvmLibcIsUpper, SimpleTest) {
+  EXPECT_NE(LIBC_NAMESPACE::isupper('B'), 0);
+
+  EXPECT_EQ(LIBC_NAMESPACE::isupper('a'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isupper('3'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isupper(' '), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isupper('?'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isupper('\0'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isupper(-1), 0);
+}
+
+// TODO: Merge the ctype tests using this framework.
+constexpr char UPPER_ARRAY[] = {
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+};
+
+bool in_span(int ch, LIBC_NAMESPACE::cpp::span<const char> arr) {
+  for (size_t i = 0; i < arr.size(); ++i)
+    if (static_cast<int>(arr[i]) == ch)
+      return true;
+  return false;
+}
+
 TEST(LlvmLibcIsUpper, DefaultLocale) {
-  // Loops through all characters, verifying that uppercase letters
-  // return a non-zero integer and everything else returns zero.
+  // Loops through all characters, verifying that uppercase letters return a
+  // non-zero integer and everything else returns zero.
   for (int ch = -255; ch < 255; ++ch) {
-    if ('A' <= ch && ch <= 'Z')
+    if (in_span(ch, UPPER_ARRAY))
       EXPECT_NE(LIBC_NAMESPACE::isupper(ch), 0);
     else
       EXPECT_EQ(LIBC_NAMESPACE::isupper(ch), 0);
diff --git a/libc/test/src/ctype/isxdigit_test.cpp b/libc/test/src/ctype/isxdigit_test.cpp
index b8f27a968540c..d7253d549907b 100644
--- a/libc/test/src/ctype/isxdigit_test.cpp
+++ b/libc/test/src/ctype/isxdigit_test.cpp
@@ -6,13 +6,41 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/CPP/span.h"
 #include "src/ctype/isxdigit.h"
+
 #include "test/UnitTest/Test.h"
 
-TEST(LlvmLibcIsXDigit, DefaultLocale) {
+TEST(LlvmLibcIsXdigit, SimpleTest) {
+  EXPECT_NE(LIBC_NAMESPACE::isxdigit('a'), 0);
+  EXPECT_NE(LIBC_NAMESPACE::isxdigit('B'), 0);
+  EXPECT_NE(LIBC_NAMESPACE::isxdigit('3'), 0);
+
+  EXPECT_EQ(LIBC_NAMESPACE::isxdigit('z'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isxdigit(' '), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isxdigit('?'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isxdigit('\0'), 0);
+  EXPECT_EQ(LIBC_NAMESPACE::isxdigit(-1), 0);
+}
+
+// TODO: Merge the ctype tests using this framework.
+constexpr char XDIGIT_ARRAY[] = {
+    'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E',
+    'F', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+};
+
+bool in_span(int ch, LIBC_NAMESPACE::cpp::span<const char> arr) {
+  for (size_t i = 0; i < arr.size(); ++i)
+    if (static_cast<int>(arr[i]) == ch)
+      return true;
+  return false;
+}
+
+TEST(LlvmLibcIsXdigit, DefaultLocale) {
+  // Loops through all characters, verifying that hexadecimal digits return a
+  // non-zero integer and everything else returns zero.
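// A hedged aside on why these tables exist (illustrative note, not part of
// the patch): only the digits '0'..'9' are guaranteed to be contiguous by the
// C standard; letters need not be. Under EBCDIC, for instance, the lowercase
// letters form three disjoint runs ('a'-'i' = 0x81-0x89, 'j'-'r' = 0x91-0x99,
// 's'-'z' = 0xA2-0xA9), so a range test admits non-letter code points:
//
//   bool by_range = ('a' <= ch && ch <= 'z');   // accepts 0x8A etc. on EBCDIC
//   bool by_table = in_span(ch, XDIGIT_ARRAY);  // correct in any encoding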
for (int ch = -255; ch < 255; ++ch) { - if (('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') || - ('A' <= ch && ch <= 'F')) + if (in_span(ch, XDIGIT_ARRAY)) EXPECT_NE(LIBC_NAMESPACE::isxdigit(ch), 0); else EXPECT_EQ(LIBC_NAMESPACE::isxdigit(ch), 0); diff --git a/libc/test/src/ctype/tolower_test.cpp b/libc/test/src/ctype/tolower_test.cpp index 3770ce4ea68b6..59432c43297b3 100644 --- a/libc/test/src/ctype/tolower_test.cpp +++ b/libc/test/src/ctype/tolower_test.cpp @@ -6,14 +6,51 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/CPP/span.h" #include "src/ctype/tolower.h" + #include "test/UnitTest/Test.h" +TEST(LlvmLibcToLower, SimpleTest) { + EXPECT_EQ(LIBC_NAMESPACE::tolower('a'), int('a')); + EXPECT_EQ(LIBC_NAMESPACE::tolower('B'), int('b')); + EXPECT_EQ(LIBC_NAMESPACE::tolower('3'), int('3')); + + EXPECT_EQ(LIBC_NAMESPACE::tolower(' '), int(' ')); + EXPECT_EQ(LIBC_NAMESPACE::tolower('?'), int('?')); + EXPECT_EQ(LIBC_NAMESPACE::tolower('\0'), int('\0')); + EXPECT_EQ(LIBC_NAMESPACE::tolower(-1), int(-1)); +} + +// TODO: Merge the ctype tests using this framework. +// Invariant: UPPER_ARR and LOWER_ARR are both the complete alphabet in the same +// order. +constexpr char UPPER_ARR[] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', + 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', +}; +constexpr char LOWER_ARR[] = { + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', + 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', +}; + +static_assert( + sizeof(UPPER_ARR) == sizeof(LOWER_ARR), + "There must be the same number of uppercase and lowercase letters."); + +int span_index(int ch, LIBC_NAMESPACE::cpp::span arr) { + for (size_t i = 0; i < arr.size(); ++i) + if (static_cast(arr[i]) == ch) + return static_cast(i); + return -1; +} + TEST(LlvmLibcToLower, DefaultLocale) { for (int ch = -255; ch < 255; ++ch) { - // This follows pattern 'A' + 32 = 'a'. - if ('A' <= ch && ch <= 'Z') - EXPECT_EQ(LIBC_NAMESPACE::tolower(ch), ch + 32); + int char_index = span_index(ch, UPPER_ARR); + if (char_index != -1) + EXPECT_EQ(LIBC_NAMESPACE::tolower(ch), + static_cast(LOWER_ARR[char_index])); else EXPECT_EQ(LIBC_NAMESPACE::tolower(ch), ch); } diff --git a/libc/test/src/ctype/toupper_test.cpp b/libc/test/src/ctype/toupper_test.cpp index 0413b43fb6009..045b00bbb4b93 100644 --- a/libc/test/src/ctype/toupper_test.cpp +++ b/libc/test/src/ctype/toupper_test.cpp @@ -6,14 +6,51 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/CPP/span.h" #include "src/ctype/toupper.h" + #include "test/UnitTest/Test.h" +TEST(LlvmLibcToUpper, SimpleTest) { + EXPECT_EQ(LIBC_NAMESPACE::toupper('a'), int('A')); + EXPECT_EQ(LIBC_NAMESPACE::toupper('B'), int('B')); + EXPECT_EQ(LIBC_NAMESPACE::toupper('3'), int('3')); + + EXPECT_EQ(LIBC_NAMESPACE::toupper(' '), int(' ')); + EXPECT_EQ(LIBC_NAMESPACE::toupper('?'), int('?')); + EXPECT_EQ(LIBC_NAMESPACE::toupper('\0'), int('\0')); + EXPECT_EQ(LIBC_NAMESPACE::toupper(-1), int(-1)); +} + +// TODO: Merge the ctype tests using this framework. +// Invariant: UPPER_ARR and LOWER_ARR are both the complete alphabet in the same +// order. 
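// A sketch of the idea behind the parallel arrays (an illustration using the
// names above, not code from the patch): since both arrays list the alphabet
// in the same order, case mapping becomes an index lookup instead of the
// ASCII-only "'a' - 32 = 'A'" arithmetic the old tests hard-coded:
//
//   constexpr char to_upper_mapped(char c) {
//     for (size_t i = 0; i < sizeof(LOWER_ARR); ++i)
//       if (LOWER_ARR[i] == c)
//         return UPPER_ARR[i]; // same index, opposite case
//     return c;                // not a lowercase letter: unchanged
//   }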
+constexpr char UPPER_ARR[] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', + 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', +}; +constexpr char LOWER_ARR[] = { + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', + 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', +}; + +static_assert( + sizeof(UPPER_ARR) == sizeof(LOWER_ARR), + "There must be the same number of uppercase and lowercase letters."); + +int span_index(int ch, LIBC_NAMESPACE::cpp::span arr) { + for (size_t i = 0; i < arr.size(); ++i) + if (static_cast(arr[i]) == ch) + return static_cast(i); + return -1; +} + TEST(LlvmLibcToUpper, DefaultLocale) { for (int ch = -255; ch < 255; ++ch) { - // This follows pattern 'a' - 32 = 'A'. - if ('a' <= ch && ch <= 'z') - EXPECT_EQ(LIBC_NAMESPACE::toupper(ch), ch - 32); + int char_index = span_index(ch, LOWER_ARR); + if (char_index != -1) + EXPECT_EQ(LIBC_NAMESPACE::toupper(ch), + static_cast(UPPER_ARR[char_index])); else EXPECT_EQ(LIBC_NAMESPACE::toupper(ch), ch); } diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 610f4d9fc1a3b..ea75720df4f43 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -85,6 +85,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + sinf16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + sinf16_test.cpp + DEPENDS + libc.src.math.sinf16 +) + add_fp_unittest( sinpif_test NEED_MPFR diff --git a/libc/test/src/math/sinf16_test.cpp b/libc/test/src/math/sinf16_test.cpp new file mode 100644 index 0000000000000..b05501cb0f145 --- /dev/null +++ b/libc/test/src/math/sinf16_test.cpp @@ -0,0 +1,40 @@ +//===-- Exhaustive test for sinf16 ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sinf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcSinf16Test = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf] +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0] +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcSinf16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, + LIBC_NAMESPACE::sinf16(x), 0.5); + } +} + +TEST_F(LlvmLibcSinf16Test, NegativeRange) { + for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, + LIBC_NAMESPACE::sinf16(x), 0.5); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index e9c785f7d9330..2c1c4dba73846 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -49,6 +49,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + sinf16_test + SUITE + libc-math-smoke-tests + SRCS + sinf16_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.sinf16 +) + add_fp_unittest( sinpif_test SUITE diff --git a/libc/test/src/math/smoke/sinf16_test.cpp b/libc/test/src/math/smoke/sinf16_test.cpp new file mode 100644 index 0000000000000..2966c3c952fd2 --- /dev/null +++ b/libc/test/src/math/smoke/sinf16_test.cpp @@ -0,0 +1,33 @@ +//===-- Unittests for sinf16 ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/errno/libc_errno.h" +#include "src/math/sinf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcSinf16Test = LIBC_NAMESPACE::testing::FPTest; + +TEST_F(LlvmLibcSinf16Test, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinf16(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(zero, LIBC_NAMESPACE::sinf16(zero)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::sinf16(neg_zero)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinf16(inf)); + EXPECT_MATH_ERRNO(EDOM); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinf16(neg_inf)); + EXPECT_MATH_ERRNO(EDOM); +} diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index 8a67848e4c330..6cfaddcbedeb6 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -8,6 +8,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/type_traits.h" +#include "src/__support/ctype_utils.h" #include "src/__support/macros/properties/architectures.h" #include "src/errno/libc_errno.h" #include "test/UnitTest/Test.h" @@ -16,14 +17,6 @@ using LIBC_NAMESPACE::cpp::is_signed_v; -static inline char int_to_b36_char(int input) { - if (input < 0 || input > 36) - return '0'; - if (input < 10) - return static_cast('0' + input); - return static_cast('A' + input - 10); -} - template struct StrtoTest : public LIBC_NAMESPACE::testing::Test { using FunctionT = ReturnT (*)(const char *, char **, int); @@ -207,7 +200,8 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { char small_string[4] = {'\0', '\0', '\0', '\0'}; for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = int_to_b36_char(first_digit); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_char(first_digit); if (first_digit < base) { LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), @@ -223,9 +217,11 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = int_to_b36_char(first_digit); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_char(first_digit); for (int second_digit = 0; second_digit <= 36; ++second_digit) { - small_string[1] = int_to_b36_char(second_digit); + small_string[1] = + LIBC_NAMESPACE::internal::int_to_b36_char(second_digit); if (first_digit < base && second_digit < base) { LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ( @@ -248,11 +244,14 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { for (int base = 2; base <= 36; ++base) { for (int first_digit = 0; first_digit <= 36; ++first_digit) { - small_string[0] = int_to_b36_char(first_digit); + small_string[0] = + LIBC_NAMESPACE::internal::int_to_b36_char(first_digit); for (int second_digit = 0; second_digit <= 36; ++second_digit) { - small_string[1] = int_to_b36_char(second_digit); + small_string[1] = + LIBC_NAMESPACE::internal::int_to_b36_char(second_digit); for (int third_digit = 0; third_digit <= limit; ++third_digit) { - small_string[2] = int_to_b36_char(third_digit); + small_string[2] = + LIBC_NAMESPACE::internal::int_to_b36_char(third_digit); if (first_digit < base && second_digit < base && third_digit < base) { diff --git 
a/libc/test/src/string/strcmp_test.cpp b/libc/test/src/string/strcmp_test.cpp
index ef58dc608c83b..234447610222f 100644
--- a/libc/test/src/string/strcmp_test.cpp
+++ b/libc/test/src/string/strcmp_test.cpp
@@ -25,13 +25,13 @@ TEST(LlvmLibcStrCmpTest, EmptyStringShouldNotEqualNonEmptyString) {
   const char *s2 = "abc";
   int result = LIBC_NAMESPACE::strcmp(empty, s2);
   // This should be '\0' - 'a' = -97
-  ASSERT_EQ(result, -97);
+  ASSERT_EQ(result, '\0' - 'a');
 
   // Similar case if empty string is second argument.
   const char *s3 = "123";
   result = LIBC_NAMESPACE::strcmp(s3, empty);
   // This should be '1' - '\0' = 49
-  ASSERT_EQ(result, 49);
+  ASSERT_EQ(result, '1' - '\0');
 }
 
 TEST(LlvmLibcStrCmpTest, EqualStringsShouldReturnZero) {
@@ -50,12 +50,12 @@ TEST(LlvmLibcStrCmpTest, ShouldReturnResultOfFirstDifference) {
   const char *s2 = "___C55__";
   int result = LIBC_NAMESPACE::strcmp(s1, s2);
   // This should return 'B' - 'C' = -1.
-  ASSERT_EQ(result, -1);
+  ASSERT_EQ(result, 'B' - 'C');
 
   // Verify operands reversed.
   result = LIBC_NAMESPACE::strcmp(s2, s1);
   // This should return 'C' - 'B' = 1.
-  ASSERT_EQ(result, 1);
+  ASSERT_EQ(result, 'C' - 'B');
 }
 
 TEST(LlvmLibcStrCmpTest, CapitalizedLetterShouldNotBeEqual) {
@@ -63,12 +63,12 @@ TEST(LlvmLibcStrCmpTest, CapitalizedLetterShouldNotBeEqual) {
   const char *s2 = "abCd";
   int result = LIBC_NAMESPACE::strcmp(s1, s2);
   // 'c' - 'C' = 32.
-  ASSERT_EQ(result, 32);
+  ASSERT_EQ(result, 'c' - 'C');
 
   // Verify operands reversed.
   result = LIBC_NAMESPACE::strcmp(s2, s1);
   // 'C' - 'c' = -32.
-  ASSERT_EQ(result, -32);
+  ASSERT_EQ(result, 'C' - 'c');
 }
 
 TEST(LlvmLibcStrCmpTest, UnequalLengthStringsShouldNotReturnZero) {
@@ -76,12 +76,12 @@ TEST(LlvmLibcStrCmpTest, UnequalLengthStringsShouldNotReturnZero) {
   const char *s2 = "abcd";
   int result = LIBC_NAMESPACE::strcmp(s1, s2);
   // '\0' - 'd' = -100.
-  ASSERT_EQ(result, -100);
+  ASSERT_EQ(result, '\0' - 'd');
 
   // Verify operands reversed.
   result = LIBC_NAMESPACE::strcmp(s2, s1);
   // 'd' - '\0' = 100.
-  ASSERT_EQ(result, 100);
+  ASSERT_EQ(result, 'd' - '\0');
 }
 
 TEST(LlvmLibcStrCmpTest, StringArgumentSwapChangesSign) {
@@ -89,11 +89,11 @@ TEST(LlvmLibcStrCmpTest, StringArgumentSwapChangesSign) {
   const char *b = "b";
   int result = LIBC_NAMESPACE::strcmp(b, a);
   // 'b' - 'a' = 1.
-  ASSERT_EQ(result, 1);
+  ASSERT_EQ(result, 'b' - 'a');
 
   result = LIBC_NAMESPACE::strcmp(a, b);
   // 'a' - 'b' = -1.
-  ASSERT_EQ(result, -1);
+  ASSERT_EQ(result, 'a' - 'b');
 }
 
 TEST(LlvmLibcStrCmpTest, Case) {
diff --git a/libc/utils/HdrGen/CMakeLists.txt b/libc/utils/HdrGen/CMakeLists.txt
deleted file mode 100644
index 47f845b9f9a5b..0000000000000
--- a/libc/utils/HdrGen/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-include(TableGen)
-
-if (NOT LLVM_LINK_LLVM_DYLIB)
-  set(LLVM_LINK_COMPONENTS Support)
-endif()
-
-add_tablegen(libc-hdrgen LIBC
-  Command.h
-  Command.cpp
-  Generator.cpp
-  Generator.h
-  IncludeFileCommand.cpp
-  IncludeFileCommand.h
-  Main.cpp
-  PublicAPICommand.cpp
-  PublicAPICommand.h
-)
-
-target_include_directories(libc-hdrgen PRIVATE ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR})
-target_link_libraries(libc-hdrgen PRIVATE LibcTableGenUtil)
-
-add_subdirectory(PrototypeTestGen)
diff --git a/libc/utils/HdrGen/Command.cpp b/libc/utils/HdrGen/Command.cpp
deleted file mode 100644
index 04462c18aac3f..0000000000000
--- a/libc/utils/HdrGen/Command.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-//===-- Base class for header generation commands -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Command.h" - -namespace llvm_libc { - -Command::~Command() {} - -} // namespace llvm_libc diff --git a/libc/utils/HdrGen/Command.h b/libc/utils/HdrGen/Command.h deleted file mode 100644 index 42516798ffaec..0000000000000 --- a/libc/utils/HdrGen/Command.h +++ /dev/null @@ -1,54 +0,0 @@ -//===-- Base class for header generation commands ---------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_UTILS_HDRGEN_COMMAND_H -#define LLVM_LIBC_UTILS_HDRGEN_COMMAND_H - -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Support/SourceMgr.h" - -#include - -namespace llvm { - -class raw_ostream; -class RecordKeeper; - -} // namespace llvm - -namespace llvm_libc { - -typedef llvm::SmallVector ArgVector; - -class Command { -public: - class ErrorReporter { - llvm::SMLoc Loc; - const llvm::SourceMgr &SrcMgr; - - public: - ErrorReporter(llvm::SMLoc L, llvm::SourceMgr &SM) : Loc(L), SrcMgr(SM) {} - - [[noreturn]] void printFatalError(llvm::Twine Msg) const { - SrcMgr.PrintMessage(Loc, llvm::SourceMgr::DK_Error, Msg); - std::exit(1); - } - }; - - virtual ~Command(); - - virtual void run(llvm::raw_ostream &OS, const ArgVector &Args, - llvm::StringRef StdHeader, const llvm::RecordKeeper &Records, - const ErrorReporter &Reporter) const = 0; -}; - -} // namespace llvm_libc - -#endif // LLVM_LIBC_UTILS_HDRGEN_COMMAND_H diff --git a/libc/utils/HdrGen/Generator.cpp b/libc/utils/HdrGen/Generator.cpp deleted file mode 100644 index b4e1166b1a37b..0000000000000 --- a/libc/utils/HdrGen/Generator.cpp +++ /dev/null @@ -1,203 +0,0 @@ -//===-- Implementation of the main header generation class ----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Generator.h" - -#include "IncludeFileCommand.h" -#include "PublicAPICommand.h" -#include "utils/LibcTableGenUtil/APIIndexer.h" - -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/raw_ostream.h" - -#include -#include - -static const char CommandPrefix[] = "%%"; -static const size_t CommandPrefixSize = llvm::StringRef(CommandPrefix).size(); - -static const char CommentPrefix[] = ""; - -static const char ParamNamePrefix[] = "${"; -static const size_t ParamNamePrefixSize = - llvm::StringRef(ParamNamePrefix).size(); -static const char ParamNameSuffix[] = "}"; -static const size_t ParamNameSuffixSize = - llvm::StringRef(ParamNameSuffix).size(); - -namespace llvm_libc { - -Command *Generator::getCommandHandler(llvm::StringRef CommandName) { - if (CommandName == IncludeFileCommand::Name) { - if (!IncludeFileCmd) - IncludeFileCmd = std::make_unique(); - return IncludeFileCmd.get(); - } else if (CommandName == PublicAPICommand::Name) { - if (!PublicAPICmd) - PublicAPICmd = std::make_unique(EntrypointNameList); - return PublicAPICmd.get(); - } else { - return nullptr; - } -} - -void Generator::parseCommandArgs(llvm::StringRef ArgStr, ArgVector &Args) { - if (!ArgStr.contains(',') && ArgStr.trim(' ').trim('\t').size() == 0) { - // If it is just space between the parenthesis - return; - } - - ArgStr.split(Args, ","); - for (llvm::StringRef &A : Args) { - A = A.trim(' '); - if (A.starts_with(ParamNamePrefix) && A.ends_with(ParamNameSuffix)) { - A = A.drop_front(ParamNamePrefixSize).drop_back(ParamNameSuffixSize); - A = ArgMap[std::string(A)]; - } - } -} - -void Generator::generate(llvm::raw_ostream &OS, - const llvm::RecordKeeper &Records) { - auto DefFileBuffer = llvm::MemoryBuffer::getFile(HeaderDefFile); - if (!DefFileBuffer) { - llvm::errs() << "Unable to open " << HeaderDefFile << ".\n"; - std::exit(1); - } - llvm::SourceMgr SrcMgr; - unsigned DefFileID = SrcMgr.AddNewSourceBuffer( - std::move(DefFileBuffer.get()), llvm::SMLoc::getFromPointer(nullptr)); - - llvm::StringRef Content = SrcMgr.getMemoryBuffer(DefFileID)->getBuffer(); - while (true) { - std::pair P = Content.split('\n'); - Content = P.second; - - llvm::StringRef Line = P.first.trim(' '); - if (Line.starts_with(CommandPrefix)) { - Line = Line.drop_front(CommandPrefixSize); - - P = Line.split("("); - // It's possible that we have windows line endings, so strip off the extra - // CR. 
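// For orientation, the deleted generator consumed .h.def files in which a
// command line looked like the following (a reconstructed example; the
// parameter name is hypothetical):
//
//   %%include_file(${llvm_libc_common_h})
//
// The "%%" prefix selects the command handler, and each ${name} argument is
// substituted by parseCommandArgs() with the value passed via the -args
// name=value option.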
- P.second = P.second.trim(); - if (P.second.empty() || P.second[P.second.size() - 1] != ')') { - SrcMgr.PrintMessage(llvm::SMLoc::getFromPointer(P.second.data()), - llvm::SourceMgr::DK_Error, - "Command argument list should begin with '(' " - "and end with ')'."); - SrcMgr.PrintMessage(llvm::SMLoc::getFromPointer(P.second.data()), - llvm::SourceMgr::DK_Error, P.second.data()); - SrcMgr.PrintMessage(llvm::SMLoc::getFromPointer(P.second.data()), - llvm::SourceMgr::DK_Error, - std::to_string(P.second.size())); - std::exit(1); - } - llvm::StringRef CommandName = P.first; - Command *Cmd = getCommandHandler(CommandName); - if (Cmd == nullptr) { - SrcMgr.PrintMessage(llvm::SMLoc::getFromPointer(CommandName.data()), - llvm::SourceMgr::DK_Error, - "Unknown command '%%" + CommandName + "'."); - std::exit(1); - } - - llvm::StringRef ArgStr = P.second.drop_back(1); - ArgVector Args; - parseCommandArgs(ArgStr, Args); - - Command::ErrorReporter Reporter( - llvm::SMLoc::getFromPointer(CommandName.data()), SrcMgr); - Cmd->run(OS, Args, StdHeader, Records, Reporter); - } else if (!Line.starts_with(CommentPrefix)) { - // There is no comment or command on this line so we just write it as is. - OS << P.first << "\n"; - } - - if (P.second.empty()) - break; - } -} - -void Generator::generateDecls(llvm::raw_ostream &OS, - const llvm::RecordKeeper &Records) { - - OS << "//===-- C standard declarations for " << StdHeader << " " - << std::string(80 - (42 + StdHeader.size()), '-') << "===//\n" - << "//\n" - << "// Part of the LLVM Project, under the Apache License v2.0 with LLVM " - "Exceptions.\n" - << "// See https://llvm.org/LICENSE.txt for license information.\n" - << "// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception\n" - << "//\n" - << "//" - "===-------------------------------------------------------------------" - "---===//\n\n"; - - std::string HeaderGuard(StdHeader.size(), '\0'); - llvm::transform(StdHeader, HeaderGuard.begin(), [](const char C) -> char { - return !isalnum(C) ? '_' : llvm::toUpper(C); - }); - OS << "#ifndef __LLVM_LIBC_DECLARATIONS_" << HeaderGuard << "\n" - << "#define __LLVM_LIBC_DECLARATIONS_" << HeaderGuard << "\n\n"; - - OS << "#ifndef __LIBC_ATTRS\n" - << "#define __LIBC_ATTRS\n" - << "#endif\n\n"; - - OS << "#ifdef __cplusplus\n" - << "extern \"C\" {\n" - << "#endif\n\n"; - - APIIndexer G(StdHeader, Records); - for (auto &Name : EntrypointNameList) { - // Filter out functions not exported by this header. - if (G.FunctionSpecMap.find(Name) == G.FunctionSpecMap.end()) - continue; - - const llvm::Record *FunctionSpec = G.FunctionSpecMap[Name]; - const llvm::Record *RetValSpec = FunctionSpec->getValueAsDef("Return"); - const llvm::Record *ReturnType = RetValSpec->getValueAsDef("ReturnType"); - - OS << G.getTypeAsString(ReturnType) << " " << Name << "("; - - auto ArgsList = FunctionSpec->getValueAsListOfDefs("Args"); - for (size_t i = 0; i < ArgsList.size(); ++i) { - const llvm::Record *ArgType = ArgsList[i]->getValueAsDef("ArgType"); - OS << G.getTypeAsString(ArgType); - if (i < ArgsList.size() - 1) - OS << ", "; - } - - OS << ") __LIBC_ATTRS;\n\n"; - } - - // Make another pass over entrypoints to emit object declarations. 
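// Net effect of generateDecls (an assumed example for the "isalnum"
// entrypoint; actual output depended on the spec files): one declaration per
// function entrypoint of the form
//
//   int isalnum(int) __LIBC_ATTRS;
//
// followed by "extern <Type> <Name> __LIBC_ATTRS;" lines for object
// entrypoints, emitted by the loop that follows.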
- for (const auto &Name : EntrypointNameList) { - if (G.ObjectSpecMap.find(Name) == G.ObjectSpecMap.end()) - continue; - const llvm::Record *ObjectSpec = G.ObjectSpecMap[Name]; - auto Type = ObjectSpec->getValueAsString("Type"); - OS << "extern " << Type << " " << Name << " __LIBC_ATTRS;\n"; - } - - // Emit a final newline if we emitted any object declarations. - if (llvm::any_of(EntrypointNameList, [&](const std::string &Name) { - return G.ObjectSpecMap.find(Name) != G.ObjectSpecMap.end(); - })) - OS << "\n"; - - OS << "#ifdef __cplusplus\n" - << "}\n" - << "#endif\n\n"; - OS << "#endif\n"; -} - -} // namespace llvm_libc diff --git a/libc/utils/HdrGen/Generator.h b/libc/utils/HdrGen/Generator.h deleted file mode 100644 index 1c149d2115640..0000000000000 --- a/libc/utils/HdrGen/Generator.h +++ /dev/null @@ -1,60 +0,0 @@ -//===-- The main header generation class ------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_UTILS_HDRGEN_GENERATOR_H -#define LLVM_LIBC_UTILS_HDRGEN_GENERATOR_H - -#include "Command.h" - -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" - -#include -#include -#include - -namespace llvm { - -class raw_ostream; -class RecordKeeper; - -} // namespace llvm - -namespace llvm_libc { - -class Command; - -class Generator { - llvm::StringRef HeaderDefFile; - const std::vector &EntrypointNameList; - llvm::StringRef StdHeader; - std::unordered_map &ArgMap; - - std::unique_ptr IncludeFileCmd; - std::unique_ptr PublicAPICmd; - - Command *getCommandHandler(llvm::StringRef CommandName); - - void parseCommandArgs(llvm::StringRef ArgStr, ArgVector &Args); - - void printError(llvm::StringRef Msg); - -public: - Generator(const std::string &DefFile, const std::vector &EN, - const std::string &Header, - std::unordered_map &Map) - : HeaderDefFile(DefFile), EntrypointNameList(EN), StdHeader(Header), - ArgMap(Map) {} - - void generate(llvm::raw_ostream &OS, const llvm::RecordKeeper &Records); - void generateDecls(llvm::raw_ostream &OS, const llvm::RecordKeeper &Records); -}; - -} // namespace llvm_libc - -#endif // LLVM_LIBC_UTILS_HDRGEN_GENERATOR_H diff --git a/libc/utils/HdrGen/IncludeFileCommand.cpp b/libc/utils/HdrGen/IncludeFileCommand.cpp deleted file mode 100644 index 43efe43585eb2..0000000000000 --- a/libc/utils/HdrGen/IncludeFileCommand.cpp +++ /dev/null @@ -1,50 +0,0 @@ -//===-- Implementation of IncludeFileCommand ------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "IncludeFileCommand.h" - -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/SourceMgr.h" - -#include - -namespace llvm_libc { - -const char IncludeFileCommand::Name[] = "include_file"; - -void IncludeFileCommand::run(llvm::raw_ostream &OS, const ArgVector &Args, - llvm::StringRef StdHeader, - const llvm::RecordKeeper &Records, - const Command::ErrorReporter &Reporter) const { - if (Args.size() != 1) { - Reporter.printFatalError( - "%%include_file command takes exactly 1 argument."); - } - - llvm::StringRef IncludeFile = Args[0]; - auto Buffer = llvm::MemoryBuffer::getFileAsStream(IncludeFile); - if (!Buffer) - Reporter.printFatalError(llvm::StringRef("Unable to open ") + IncludeFile); - - llvm::StringRef Content = Buffer.get()->getBuffer(); - - // If the included file has %%begin() command listed, then we want to write - // only the content after the begin command. - // TODO: The way the content is split below does not allow space within the - // the parentheses and, before and after the command. This probably is too - // strict and should be relaxed. - auto P = Content.split("\n%%begin()\n"); - if (P.second.empty()) { - // There was no %%begin in the content. - OS << P.first; - } else { - OS << P.second; - } -} - -} // namespace llvm_libc diff --git a/libc/utils/HdrGen/IncludeFileCommand.h b/libc/utils/HdrGen/IncludeFileCommand.h deleted file mode 100644 index b3a6ff5462ded..0000000000000 --- a/libc/utils/HdrGen/IncludeFileCommand.h +++ /dev/null @@ -1,32 +0,0 @@ -//===-- Class which implements the %%include_file command -------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_UTILS_HDRGEN_INCLUDE_COMMAND_H -#define LLVM_LIBC_UTILS_HDRGEN_INCLUDE_COMMAND_H - -#include "Command.h" - -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" - -#include - -namespace llvm_libc { - -class IncludeFileCommand : public Command { -public: - static const char Name[]; - - void run(llvm::raw_ostream &OS, const ArgVector &Args, - llvm::StringRef StdHeader, const llvm::RecordKeeper &Records, - const Command::ErrorReporter &Reporter) const override; -}; - -} // namespace llvm_libc - -#endif // LLVM_LIBC_UTILS_HDRGEN_INCLUDE_COMMAND_H diff --git a/libc/utils/HdrGen/Main.cpp b/libc/utils/HdrGen/Main.cpp deleted file mode 100644 index f795e96e45c57..0000000000000 --- a/libc/utils/HdrGen/Main.cpp +++ /dev/null @@ -1,62 +0,0 @@ -//===-- "main" function of libc-hdrgen ------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Generator.h" - -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/TableGen/Main.h" - -#include -#include - -static llvm::cl::opt - HeaderDefFile("def", llvm::cl::desc("Path to the .h.def file."), - llvm::cl::value_desc(""), llvm::cl::Required); -static llvm::cl::opt StandardHeader( - "header", - llvm::cl::desc("The standard header file which is to be generated."), - llvm::cl::value_desc("
")); -static llvm::cl::list EntrypointNamesOption( - "e", llvm::cl::value_desc(""), - llvm::cl::desc( - "Each --e is one entrypoint (generated from entrypoints.txt)"), - llvm::cl::OneOrMore); -static llvm::cl::list ReplacementValues( - "args", llvm::cl::desc("Command separated = pairs."), - llvm::cl::value_desc("[,name=value]")); -static llvm::cl::opt ExportDecls( - "export-decls", - llvm::cl::desc("Output a new header containing only the entrypoints.")); - -static void -ParseArgValuePairs(std::unordered_map &Map) { - for (std::string &R : ReplacementValues) { - auto Pair = llvm::StringRef(R).split('='); - Map[std::string(Pair.first)] = std::string(Pair.second); - } -} - -static bool HeaderGeneratorMain(llvm::raw_ostream &OS, - const llvm::RecordKeeper &Records) { - std::unordered_map ArgMap; - ParseArgValuePairs(ArgMap); - llvm_libc::Generator G(HeaderDefFile, EntrypointNamesOption, StandardHeader, - ArgMap); - if (ExportDecls) - G.generateDecls(OS, Records); - else - G.generate(OS, Records); - - return false; -} - -int main(int argc, char *argv[]) { - llvm::cl::ParseCommandLineOptions(argc, argv); - return TableGenMain(argv[0], &HeaderGeneratorMain); -} diff --git a/libc/utils/HdrGen/PrototypeTestGen/CMakeLists.txt b/libc/utils/HdrGen/PrototypeTestGen/CMakeLists.txt deleted file mode 100644 index 9e25c21c6b359..0000000000000 --- a/libc/utils/HdrGen/PrototypeTestGen/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_tablegen(libc-prototype-testgen LLVM_LIBC - PrototypeTestGen.cpp -) -target_link_libraries(libc-prototype-testgen PRIVATE LibcTableGenUtil) -target_include_directories(libc-prototype-testgen PRIVATE ${LIBC_SOURCE_DIR}) diff --git a/libc/utils/HdrGen/PrototypeTestGen/PrototypeTestGen.cpp b/libc/utils/HdrGen/PrototypeTestGen/PrototypeTestGen.cpp deleted file mode 100644 index 7acaf75c4c1c8..0000000000000 --- a/libc/utils/HdrGen/PrototypeTestGen/PrototypeTestGen.cpp +++ /dev/null @@ -1,106 +0,0 @@ -//===-- PrototypeTestGen.cpp ----------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "utils/LibcTableGenUtil/APIIndexer.h" - -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/TableGen/Main.h" -#include "llvm/TableGen/Record.h" - -namespace { - -llvm::cl::list - EntrypointNamesOption("e", llvm::cl::desc(""), - llvm::cl::OneOrMore); - -} // anonymous namespace - -bool TestGeneratorMain(llvm::raw_ostream &OS, - const llvm::RecordKeeper &records) { - OS << "#include \"src/__support/CPP/type_traits.h\"\n"; - llvm_libc::APIIndexer G(records); - std::unordered_set headerFileSet; - for (const auto &entrypoint : EntrypointNamesOption) { - if (entrypoint == "errno") - continue; - auto match = G.FunctionToHeaderMap.find(entrypoint); - if (match == G.FunctionToHeaderMap.end()) { - auto objectMatch = G.ObjectToHeaderMap.find(entrypoint); - if (objectMatch != G.ObjectToHeaderMap.end()) { - headerFileSet.insert(objectMatch->second); - continue; - } - - llvm::errs() << "ERROR: entrypoint '" << entrypoint - << "' could not be found in spec in any public header\n"; - return true; - } - headerFileSet.insert(match->second); - } - for (const auto &header : headerFileSet) - OS << "#include <" << header << ">\n"; - - OS << '\n'; - - OS << "extern \"C\" int main() {\n"; - for (const auto &entrypoint : EntrypointNamesOption) { - if (entrypoint == "errno") - continue; - auto match = G.FunctionSpecMap.find(entrypoint); - if (match == G.FunctionSpecMap.end()) { - auto objectMatch = G.ObjectSpecMap.find(entrypoint); - if (objectMatch != G.ObjectSpecMap.end()) { - auto entrypointPtr = entrypoint + "_ptr"; - llvm::Record *objectSpec = G.ObjectSpecMap[entrypoint]; - auto objectType = objectSpec->getValueAsString("Type"); - // We just make sure that the global object is present. - OS << " " << objectType << " *" << entrypointPtr << " = &" - << entrypoint << ";\n"; - OS << " ++" << entrypointPtr << ";\n"; // To avoid unused var warning. - continue; - } - llvm::errs() << "ERROR: entrypoint '" << entrypoint - << "' could not be found in spec in any public header\n"; - return true; - } - llvm::Record *functionSpec = match->second; - llvm::Record *retValSpec = functionSpec->getValueAsDef("Return"); - std::string returnType = - G.getTypeAsString(retValSpec->getValueAsDef("ReturnType")); - // _Noreturn is an indication for the compiler that a function - // doesn't return, and isn't a type understood by c++ templates. 
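// Concretely, the assertions this tool generated looked like the following
// (assumed example for isalnum; __NOEXCEPT is supplied by the public
// headers):
//
//   static_assert(
//       LIBC_NAMESPACE::cpp::is_same_v<int(int) __NOEXCEPT,
//                                      decltype(isalnum)>,
//       "isalnum prototype in TableGen does not match public header");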
- if (llvm::StringRef(returnType).contains("_Noreturn")) - returnType = "void"; - - OS << " static_assert(LIBC_NAMESPACE::cpp::is_same_v<" << returnType - << '('; - auto args = functionSpec->getValueAsListOfDefs("Args"); - for (size_t i = 0, size = args.size(); i < size; ++i) { - llvm::Record *argType = args[i]->getValueAsDef("ArgType"); - OS << G.getTypeAsString(argType); - if (i < size - 1) - OS << ", "; - } - OS << ") __NOEXCEPT, decltype(" << entrypoint << ")>, "; - OS << '"' << entrypoint - << " prototype in TableGen does not match public header" << '"'; - OS << ");\n"; - } - - OS << '\n'; - OS << " return 0;\n"; - OS << "}\n\n"; - - return false; -} - -int main(int argc, char *argv[]) { - llvm::cl::ParseCommandLineOptions(argc, argv); - return TableGenMain(argv[0], TestGeneratorMain); -} diff --git a/libc/utils/HdrGen/PublicAPICommand.cpp b/libc/utils/HdrGen/PublicAPICommand.cpp deleted file mode 100644 index 5c46c82a23853..0000000000000 --- a/libc/utils/HdrGen/PublicAPICommand.cpp +++ /dev/null @@ -1,331 +0,0 @@ -//===-- Implementation of PublicAPICommand --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "PublicAPICommand.h" - -#include "utils/LibcTableGenUtil/APIIndexer.h" - -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/TableGen/Record.h" -#include -#include - -// Text blocks for macro definitions and type decls can be indented to -// suit the surrounding tablegen listing. We need to dedent such blocks -// before writing them out. -static void dedentAndWrite(llvm::StringRef Text, llvm::raw_ostream &OS) { - llvm::SmallVector Lines; - llvm::SplitString(Text, Lines, "\n"); - size_t shortest_indent = 1024; - for (llvm::StringRef L : Lines) { - llvm::StringRef Indent = L.take_while([](char c) { return c == ' '; }); - size_t IndentSize = Indent.size(); - if (Indent.size() == L.size()) { - // Line is all spaces so no point noting the indent. 
- continue; - } - if (IndentSize < shortest_indent) - shortest_indent = IndentSize; - } - for (llvm::StringRef L : Lines) { - if (L.size() >= shortest_indent) - OS << L.drop_front(shortest_indent) << '\n'; - } -} - -static std::string getTypeHdrName(const std::string &Name) { - llvm::SmallVector Parts; - llvm::SplitString(llvm::StringRef(Name), Parts); - return llvm::join(Parts.begin(), Parts.end(), "_"); -} - -namespace llvm_libc { - -static bool isAsciiStart(char C) { - return (C >= 'A' && C <= 'Z') || (C >= 'a' && C <= 'z') || C == '_'; -} - -static bool isAsciiContinue(char C) { - return isAsciiStart(C) || (C >= '0' && C <= '9'); -} - -static bool isAsciiIdentifier(llvm::StringRef S) { - if (S.empty()) - return false; - if (!isAsciiStart(S[0])) - return false; - for (char C : S.drop_front()) - if (!isAsciiContinue(C)) - return false; - return true; -} - -static AttributeStyle getAttributeStyle(const llvm::Record *Instance) { - llvm::StringRef Style = Instance->getValueAsString("Style"); - return llvm::StringSwitch(Style) - .Case("cxx11", AttributeStyle::Cxx11) - .Case("gnu", AttributeStyle::Gnu) - .Case("declspec", AttributeStyle::Declspec) - .Default(AttributeStyle::Gnu); -} - -static AttributeNamespace getAttributeNamespace(const llvm::Record *Instance) { - llvm::StringRef Namespace = Instance->getValueAsString("Namespace"); - return llvm::StringSwitch(Namespace) - .Case("clang", AttributeNamespace::Clang) - .Case("gnu", AttributeNamespace::Gnu) - .Default(AttributeNamespace::None); -} - -using AttributeMap = llvm::DenseMap; - -template -static AttributeMap collectAttributeMacros(const SpecMap &Spec, - const FuncList &Funcs) { - llvm::DenseMap MacroAttr; - for (const auto &Name : Funcs) { - auto Iter = Spec.find(Name); - if (Iter == Spec.end()) - continue; - - const llvm::Record *FunctionSpec = Iter->second; - for (const llvm::Record *Attr : - FunctionSpec->getValueAsListOfDefs("Attributes")) - MacroAttr[Attr->getValueAsString("Macro")] = Attr; - } - return MacroAttr; -} - -static void emitAttributeMacroDecls(const AttributeMap &MacroAttr, - llvm::raw_ostream &OS) { - for (auto &[Macro, Attr] : MacroAttr) { - std::vector Instances = - Attr->getValueAsListOfDefs("Instances"); - llvm::SmallVector> Styles; - std::transform(Instances.begin(), Instances.end(), - std::back_inserter(Styles), - [&](const llvm::Record *Instance) - -> std::pair { - auto Style = getAttributeStyle(Instance); - return {Style, Instance}; - }); - // 1. If __cplusplus is defined and cxx11 style is provided, define the - // macro using cxx11 version with the following priority: - // 1a. If there is no namespace (so the macro is supposed to be - // compiler-independent), use this version first. This macro will be - // tested via __has_cpp_attribute. - // 1b. If the attribute is a clang attribute, check for __clang__. - // 1c. If the attribute is a gnu attribute, check for __GNUC__. - // 2. Otherwise, if __GNUC__ is defined and gnu style is provided, - // define the macro using gnu version; - // 3. Otherwise, if _MSC_VER is defined and __declspec is provided, define - // the macro using __declspec version; - // 4. Fallback to empty macro. 
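// As a worked example of the priority rules above (the macro name FOO_ATTR
// and attribute name "foo" are hypothetical), a macro with a clang-namespaced
// cxx11 instance and a gnu instance would be emitted as:
//
//   #if !defined(FOO_ATTR) && defined(__cplusplus) && defined(__clang__)
//   #if __has_attribute(foo)
//   #define FOO_ATTR [[clang::foo]]
//   #endif
//   #endif
//   #if !defined(FOO_ATTR) && defined(__GNUC__)
//   #if __has_attribute(foo)
//   #define FOO_ATTR __attribute__((foo))
//   #endif
//   #endif
//   #if !defined(FOO_ATTR)
//   #define FOO_ATTR
//   #endif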
- std::sort(Styles.begin(), Styles.end(), [&](auto &a, auto &b) { - if (a.first == AttributeStyle::Cxx11 && b.first == AttributeStyle::Cxx11) - return getAttributeNamespace(a.second) < - getAttributeNamespace(b.second); - return a.first < b.first; - }); - for (auto &[Style, Instance] : Styles) { - llvm::StringRef Attr = Instance->getValueAsString("Attr"); - if (Style == AttributeStyle::Cxx11) { - OS << "#if !defined(" << Macro << ") && defined(__cplusplus)"; - AttributeNamespace Namespace = getAttributeNamespace(Instance); - if (Namespace == AttributeNamespace::Clang) - OS << " && defined(__clang__)\n"; - else if (Namespace == AttributeNamespace::Gnu) - OS << " && defined(__GNUC__)\n"; - else - OS << '\n'; - if (isAsciiIdentifier(Attr) && Namespace != AttributeNamespace::None) - OS << "#if __has_attribute(" << Attr << ")\n"; - else - OS << "#if __has_cpp_attribute(" << Attr << ")\n"; - OS << "#define " << Macro << " [["; - if (Namespace == AttributeNamespace::Clang) - OS << "clang::"; - else if (Namespace == AttributeNamespace::Gnu) - OS << "gnu::"; - OS << Attr << "]]\n"; - if (isAsciiIdentifier(Attr)) - OS << "#endif\n"; - OS << "#endif\n"; - } - if (Style == AttributeStyle::Gnu) { - OS << "#if !defined(" << Macro << ") && defined(__GNUC__)\n"; - if (isAsciiIdentifier(Attr)) - OS << "#if __has_attribute(" << Attr << ")\n"; - OS << "#define " << Macro << " __attribute__(("; - OS << Attr << "))\n"; - if (isAsciiIdentifier(Attr)) - OS << "#endif\n"; - OS << "#endif\n"; - } - if (Style == AttributeStyle::Declspec) { - OS << "#if !defined(" << Macro << ") && defined(_MSC_VER)\n"; - OS << "#define " << Macro << " __declspec("; - OS << Attr << ")\n"; - OS << "#endif\n"; - } - } - OS << "#if !defined(" << Macro << ")\n"; - OS << "#define " << Macro << '\n'; - OS << "#endif\n"; - } - - if (!MacroAttr.empty()) - OS << '\n'; -} - -static void emitAttributeMacroForFunction(const llvm::Record *FunctionSpec, - llvm::raw_ostream &OS) { - std::vector Attributes = - FunctionSpec->getValueAsListOfDefs("Attributes"); - llvm::interleave( - Attributes.begin(), Attributes.end(), - [&](const llvm::Record *Attr) { OS << Attr->getValueAsString("Macro"); }, - [&]() { OS << ' '; }); - if (!Attributes.empty()) - OS << ' '; -} - -static void emitUndefsForAttributeMacros(const AttributeMap &MacroAttr, - llvm::raw_ostream &OS) { - if (!MacroAttr.empty()) - OS << '\n'; - for (auto &[Macro, Attr] : MacroAttr) - OS << "#undef " << Macro << '\n'; -} - -static void writeAPIFromIndex(APIIndexer &G, - std::vector EntrypointNameList, - llvm::raw_ostream &OS) { - for (auto &Pair : G.MacroDefsMap) { - const std::string &Name = Pair.first; - if (!G.MacroSpecMap.count(Name)) - llvm::PrintFatalError(Name + " not found in any standard spec.\n"); - - const llvm::Record *MacroDef = Pair.second; - dedentAndWrite(MacroDef->getValueAsString("Defn"), OS); - - OS << '\n'; - } - - for (auto &TypeName : G.RequiredTypes) { - if (!G.TypeSpecMap.count(TypeName)) - llvm::PrintFatalError(TypeName + " not found in any standard spec.\n"); - OS << "#include \n"; - } - OS << '\n'; - - if (G.Enumerations.size() != 0) - OS << "enum {" << '\n'; - for (const auto &Name : G.Enumerations) { - if (!G.EnumerationSpecMap.count(Name)) - llvm::PrintFatalError( - Name + " is not listed as an enumeration in any standard spec.\n"); - - const llvm::Record *EnumerationSpec = G.EnumerationSpecMap[Name]; - OS << " " << EnumerationSpec->getValueAsString("Name"); - auto Value = EnumerationSpec->getValueAsString("Value"); - if (Value == "__default__") { - OS << ",\n"; - 
} else { - OS << " = " << Value << ",\n"; - } - } - if (G.Enumerations.size() != 0) - OS << "};\n\n"; - - // Collect and declare macros for attributes - AttributeMap MacroAttr = - collectAttributeMacros(G.FunctionSpecMap, EntrypointNameList); - emitAttributeMacroDecls(MacroAttr, OS); - - OS << "__BEGIN_C_DECLS\n\n"; - for (auto &Name : EntrypointNameList) { - auto Iter = G.FunctionSpecMap.find(Name); - - // Functions that aren't in this header file are skipped as - // opposed to erroring out because the list of functions being - // iterated over is the complete list of functions with - // entrypoints. Thus this is filtering out the functions that - // don't go to this header file, whereas the other, similar - // conditionals above are more of a sanity check. - if (Iter == G.FunctionSpecMap.end()) - continue; - - const llvm::Record *FunctionSpec = Iter->second; - const llvm::Record *RetValSpec = FunctionSpec->getValueAsDef("Return"); - const llvm::Record *ReturnType = RetValSpec->getValueAsDef("ReturnType"); - - // TODO: https://github.com/llvm/llvm-project/issues/81208 - // Ideally, we should group functions based on their guarding macros. - bool Guarded = - (FunctionSpec->getType()->getAsString() == "GuardedFunctionSpec"); - - if (Guarded) - OS << "#ifdef " << FunctionSpec->getValueAsString("Guard") << "\n"; - - // Emit attribute macros for the function. Space is automatically added. - emitAttributeMacroForFunction(FunctionSpec, OS); - OS << G.getTypeAsString(ReturnType) << " " << Name << "("; - - auto ArgsList = FunctionSpec->getValueAsListOfDefs("Args"); - for (size_t i = 0; i < ArgsList.size(); ++i) { - const llvm::Record *ArgType = ArgsList[i]->getValueAsDef("ArgType"); - OS << G.getTypeAsString(ArgType); - if (i < ArgsList.size() - 1) - OS << ", "; - } - - OS << ") __NOEXCEPT;\n"; - - if (Guarded) - OS << "#endif // " << FunctionSpec->getValueAsString("Guard") << "\n"; - - OS << "\n"; - } - - // Make another pass over entrypoints to emit object declarations. - for (const auto &Name : EntrypointNameList) { - auto Iter = G.ObjectSpecMap.find(Name); - if (Iter == G.ObjectSpecMap.end()) - continue; - const llvm::Record *ObjectSpec = Iter->second; - auto Type = ObjectSpec->getValueAsString("Type"); - OS << "extern " << Type << " " << Name << ";\n"; - } - OS << "__END_C_DECLS\n"; - - // Undef file-level attribute macros. - emitUndefsForAttributeMacros(MacroAttr, OS); -} - -void writePublicAPI(llvm::raw_ostream &OS, const llvm::RecordKeeper &Records) {} - -const char PublicAPICommand::Name[] = "public_api"; - -void PublicAPICommand::run(llvm::raw_ostream &OS, const ArgVector &Args, - llvm::StringRef StdHeader, - const llvm::RecordKeeper &Records, - const Command::ErrorReporter &Reporter) const { - if (Args.size() != 0) - Reporter.printFatalError("public_api command does not take any arguments."); - - APIIndexer G(StdHeader, Records); - writeAPIFromIndex(G, EntrypointNameList, OS); -} - -} // namespace llvm_libc diff --git a/libc/utils/HdrGen/PublicAPICommand.h b/libc/utils/HdrGen/PublicAPICommand.h deleted file mode 100644 index 49078f4857f90..0000000000000 --- a/libc/utils/HdrGen/PublicAPICommand.h +++ /dev/null @@ -1,48 +0,0 @@ -//===-- Implementation of PublicAPICommand ----------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_UTILS_HDRGEN_PUBLICAPICOMMAND_H -#define LLVM_LIBC_UTILS_HDRGEN_PUBLICAPICOMMAND_H - -#include "Command.h" - -#include "llvm/ADT/StringRef.h" -#include "llvm/TableGen/Error.h" -#include "llvm/TableGen/Record.h" - -namespace llvm { - -class raw_ostream; -class Record; -class RecordKeeper; - -} // namespace llvm - -namespace llvm_libc { - -enum class AttributeStyle { Cxx11 = 0, Gnu = 1, Declspec = 2 }; -enum class AttributeNamespace { None = 0, Clang = 1, Gnu = 2 }; - -class PublicAPICommand : public Command { -private: - const std::vector &EntrypointNameList; - -public: - static const char Name[]; - - PublicAPICommand(const std::vector &EntrypointNames) - : EntrypointNameList(EntrypointNames) {} - - void run(llvm::raw_ostream &OS, const ArgVector &Args, - llvm::StringRef StdHeader, const llvm::RecordKeeper &Records, - const Command::ErrorReporter &Reporter) const override; -}; - -} // namespace llvm_libc - -#endif // LLVM_LIBC_UTILS_HDRGEN_PUBLICAPICOMMAND_H diff --git a/libc/utils/HdrGen/README.md b/libc/utils/HdrGen/README.md deleted file mode 100644 index a61cf3bacbb0e..0000000000000 --- a/libc/utils/HdrGen/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# The LLVM libc header generation system - -LLVM libc uses a header generation scheme to generate public as well as internal -header files. This directory contains the implementation of the header generator -which drives this header generation scheme. diff --git a/libc/utils/LibcTableGenUtil/APIIndexer.cpp b/libc/utils/LibcTableGenUtil/APIIndexer.cpp deleted file mode 100644 index cf66d0a7aef6e..0000000000000 --- a/libc/utils/LibcTableGenUtil/APIIndexer.cpp +++ /dev/null @@ -1,173 +0,0 @@ -//===-- Implementation of APIIndexer class --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "APIIndexer.h" - -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/TableGen/Error.h" -#include "llvm/TableGen/Record.h" - -namespace llvm_libc { - -static const char NamedTypeClassName[] = "NamedType"; -static const char PtrTypeClassName[] = "PtrType"; -static const char RestrictedPtrTypeClassName[] = "RestrictedPtrType"; -static const char ConstTypeClassName[] = "ConstType"; -static const char StructTypeClassName[] = "Struct"; - -static const char StandardSpecClassName[] = "StandardSpec"; -static const char PublicAPIClassName[] = "PublicAPI"; - -static bool isa(const llvm::Record *Def, const llvm::Record *TypeClass) { - const llvm::RecordRecTy *RecordType = Def->getType(); - llvm::ArrayRef Classes = RecordType->getClasses(); - // We want exact types. That is, we don't want the classes listed in - // spec.td to be subclassed. Hence, we do not want the record |Def| - // to be of more than one class type.. 
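// In spec.td terms, the exact-type check below accepts a def whose type is a
// single class and rejects subclassed records (record names here are
// illustrative assumptions):
//
//   def IntType : NamedType<"int">;         // one class: isaNamedType -> true
//   class Derived<string n> : NamedType<n> {}
//   def OddType : Derived<"odd_t">;         // more than one class: rejected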
- if (Classes.size() != 1) - return false; - return Classes[0] == TypeClass; -} - -bool APIIndexer::isaNamedType(const llvm::Record *Def) { - return isa(Def, NamedTypeClass); -} - -bool APIIndexer::isaStructType(const llvm::Record *Def) { - return isa(Def, StructClass); -} - -bool APIIndexer::isaPtrType(const llvm::Record *Def) { - return isa(Def, PtrTypeClass); -} - -bool APIIndexer::isaConstType(const llvm::Record *Def) { - return isa(Def, ConstTypeClass); -} - -bool APIIndexer::isaRestrictedPtrType(const llvm::Record *Def) { - return isa(Def, RestrictedPtrTypeClass); -} - -bool APIIndexer::isaStandardSpec(const llvm::Record *Def) { - return isa(Def, StandardSpecClass); -} - -bool APIIndexer::isaPublicAPI(const llvm::Record *Def) { - return isa(Def, PublicAPIClass); -} - -std::string APIIndexer::getTypeAsString(const llvm::Record *TypeRecord) { - if (isaNamedType(TypeRecord) || isaStructType(TypeRecord)) { - return std::string(TypeRecord->getValueAsString("Name")); - } else if (isaPtrType(TypeRecord)) { - return getTypeAsString(TypeRecord->getValueAsDef("PointeeType")) + " *"; - } else if (isaConstType(TypeRecord)) { - return std::string("const ") + - getTypeAsString(TypeRecord->getValueAsDef("UnqualifiedType")); - } else if (isaRestrictedPtrType(TypeRecord)) { - return getTypeAsString(TypeRecord->getValueAsDef("PointeeType")) + - " *__restrict"; - } else { - llvm::PrintFatalError(TypeRecord->getLoc(), "Invalid type.\n"); - } -} - -void APIIndexer::indexStandardSpecDef(const llvm::Record *StandardSpec) { - auto HeaderSpecList = StandardSpec->getValueAsListOfDefs("Headers"); - for (const llvm::Record *HeaderSpec : HeaderSpecList) { - llvm::StringRef Header = HeaderSpec->getValueAsString("Name"); - if (!StdHeader.has_value() || Header == StdHeader) { - PublicHeaders.emplace(Header); - auto MacroSpecList = HeaderSpec->getValueAsListOfDefs("Macros"); - // TODO: Trigger a fatal error on duplicate specs. - for (const llvm::Record *MacroSpec : MacroSpecList) - MacroSpecMap[std::string(MacroSpec->getValueAsString("Name"))] = - MacroSpec; - - auto TypeSpecList = HeaderSpec->getValueAsListOfDefs("Types"); - for (const llvm::Record *TypeSpec : TypeSpecList) - TypeSpecMap[std::string(TypeSpec->getValueAsString("Name"))] = TypeSpec; - - auto FunctionSpecList = HeaderSpec->getValueAsListOfDefs("Functions"); - for (const llvm::Record *FunctionSpec : FunctionSpecList) { - auto FunctionName = std::string(FunctionSpec->getValueAsString("Name")); - FunctionSpecMap[FunctionName] = FunctionSpec; - FunctionToHeaderMap[FunctionName] = std::string(Header); - } - - auto EnumerationSpecList = - HeaderSpec->getValueAsListOfDefs("Enumerations"); - for (const llvm::Record *EnumerationSpec : EnumerationSpecList) { - EnumerationSpecMap[std::string( - EnumerationSpec->getValueAsString("Name"))] = EnumerationSpec; - } - - auto ObjectSpecList = HeaderSpec->getValueAsListOfDefs("Objects"); - for (const llvm::Record *ObjectSpec : ObjectSpecList) { - auto ObjectName = std::string(ObjectSpec->getValueAsString("Name")); - ObjectSpecMap[ObjectName] = ObjectSpec; - ObjectToHeaderMap[ObjectName] = std::string(Header); - } - } - } -} - -void APIIndexer::indexPublicAPIDef(const llvm::Record *PublicAPI) { - // While indexing the public API, we do not check if any of the entities - // requested is from an included standard. Such a check is done while - // generating the API. 
- auto MacroDefList = PublicAPI->getValueAsListOfDefs("Macros"); - for (const llvm::Record *MacroDef : MacroDefList) - MacroDefsMap[std::string(MacroDef->getValueAsString("Name"))] = MacroDef; - - auto TypeList = PublicAPI->getValueAsListOfStrings("Types"); - for (llvm::StringRef TypeName : TypeList) - RequiredTypes.insert(std::string(TypeName)); - - auto StructList = PublicAPI->getValueAsListOfStrings("Structs"); - for (llvm::StringRef StructName : StructList) - Structs.insert(std::string(StructName)); - - auto FunctionList = PublicAPI->getValueAsListOfStrings("Functions"); - for (llvm::StringRef FunctionName : FunctionList) - Functions.insert(std::string(FunctionName)); - - auto EnumerationList = PublicAPI->getValueAsListOfStrings("Enumerations"); - for (llvm::StringRef EnumerationName : EnumerationList) - Enumerations.insert(std::string(EnumerationName)); - - auto ObjectList = PublicAPI->getValueAsListOfStrings("Objects"); - for (llvm::StringRef ObjectName : ObjectList) - Objects.insert(std::string(ObjectName)); -} - -void APIIndexer::index(const llvm::RecordKeeper &Records) { - NamedTypeClass = Records.getClass(NamedTypeClassName); - PtrTypeClass = Records.getClass(PtrTypeClassName); - RestrictedPtrTypeClass = Records.getClass(RestrictedPtrTypeClassName); - StructClass = Records.getClass(StructTypeClassName); - ConstTypeClass = Records.getClass(ConstTypeClassName); - StandardSpecClass = Records.getClass(StandardSpecClassName); - PublicAPIClass = Records.getClass(PublicAPIClassName); - - const auto &DefsMap = Records.getDefs(); - for (auto &Pair : DefsMap) { - const llvm::Record *Def = Pair.second.get(); - if (isaStandardSpec(Def)) - indexStandardSpecDef(Def); - if (isaPublicAPI(Def)) { - if (!StdHeader.has_value() || - Def->getValueAsString("HeaderName") == StdHeader) - indexPublicAPIDef(Def); - } - } -} - -} // namespace llvm_libc diff --git a/libc/utils/LibcTableGenUtil/APIIndexer.h b/libc/utils/LibcTableGenUtil/APIIndexer.h deleted file mode 100644 index b8bca15ba131c..0000000000000 --- a/libc/utils/LibcTableGenUtil/APIIndexer.h +++ /dev/null @@ -1,86 +0,0 @@ -//===-- A class to index libc API listed in tablegen files ------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_UTILS_LIBC_TABLE_GEN_UTILS_API_INDEXER_H -#define LLVM_LIBC_UTILS_LIBC_TABLE_GEN_UTILS_API_INDEXER_H - -#include "llvm/ADT/StringRef.h" -#include "llvm/TableGen/Record.h" - -#include <optional> -#include <string> -#include <unordered_map> -#include <unordered_set> - -namespace llvm_libc { - -class APIIndexer { -private: - std::optional<std::string> StdHeader; - - // TableGen classes in spec.td.
- const llvm::Record *NamedTypeClass; - const llvm::Record *PtrTypeClass; - const llvm::Record *RestrictedPtrTypeClass; - const llvm::Record *ConstTypeClass; - const llvm::Record *StructClass; - const llvm::Record *StandardSpecClass; - const llvm::Record *PublicAPIClass; - - bool isaNamedType(const llvm::Record *Def); - bool isaStructType(const llvm::Record *Def); - bool isaPtrType(const llvm::Record *Def); - bool isaConstType(const llvm::Record *Def); - bool isaRestrictedPtrType(const llvm::Record *Def); - bool isaStandardSpec(const llvm::Record *Def); - bool isaPublicAPI(const llvm::Record *Def); - - void indexStandardSpecDef(const llvm::Record *StandardSpec); - void indexPublicAPIDef(const llvm::Record *PublicAPI); - void index(const llvm::RecordKeeper &Records); - -public: - using NameToRecordMapping = - std::unordered_map<std::string, const llvm::Record *>; - using NameSet = std::unordered_set<std::string>; - - // This indexes all headers, not just a specified one. - explicit APIIndexer(const llvm::RecordKeeper &Records) - : StdHeader(std::nullopt) { - index(Records); - } - - APIIndexer(llvm::StringRef Header, const llvm::RecordKeeper &Records) - : StdHeader(Header) { - index(Records); - } - - // Mapping from names to records defining them. - NameToRecordMapping MacroSpecMap; - NameToRecordMapping TypeSpecMap; - NameToRecordMapping EnumerationSpecMap; - NameToRecordMapping FunctionSpecMap; - NameToRecordMapping MacroDefsMap; - NameToRecordMapping ObjectSpecMap; - - std::unordered_map<std::string, std::string> FunctionToHeaderMap; - std::unordered_map<std::string, std::string> ObjectToHeaderMap; - - NameSet RequiredTypes; - NameSet Structs; - NameSet Enumerations; - NameSet Functions; - NameSet Objects; - NameSet PublicHeaders; - - std::string getTypeAsString(const llvm::Record *TypeRecord); -}; - -} // namespace llvm_libc - -#endif // LLVM_LIBC_UTILS_LIBC_TABLE_GEN_UTILS_API_INDEXER_H diff --git a/libc/utils/LibcTableGenUtil/CMakeLists.txt b/libc/utils/LibcTableGenUtil/CMakeLists.txt deleted file mode 100644 index 9421383394a35..0000000000000 --- a/libc/utils/LibcTableGenUtil/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -if (NOT LLVM_LINK_LLVM_DYLIB) - set(flags "DISABLE_LLVM_LINK_LLVM_DYLIB;LINK_COMPONENTS;Support;TableGen") -else() - set(flags "LINK_COMPONENTS;TableGen") -endif() -add_llvm_library( - LibcTableGenUtil - APIIndexer.cpp - APIIndexer.h - ${flags} -) -target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR}) -target_include_directories(LibcTableGenUtil PRIVATE ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}) diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h index ae0475693f22b..d83719c8733d7 100644 --- a/libcxx/include/__atomic/atomic.h +++ b/libcxx/include/__atomic/atomic.h @@ -16,7 +16,6 @@ #include <__atomic/memory_order.h> #include <__config> #include <__cstddef/ptrdiff_t.h> -#include <__functional/operations.h> #include <__memory/addressof.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_floating_point.h> @@ -376,7 +375,8 @@ struct atomic<_Tp> : __atomic_base<_Tp> { auto __builtin_op = [](auto __a, auto __builtin_operand, auto __order) { return std::__cxx_atomic_fetch_add(__a, __builtin_operand, __order); }; - return __rmw_op(std::forward<_This>(__self), __operand, __m, std::plus<>{}, __builtin_op); + auto __plus = [](auto __a, auto __b) { return __a + __b; }; + return __rmw_op(std::forward<_This>(__self), __operand, __m, __plus, __builtin_op); } template <class _This> @@ -384,7 +384,8 @@ struct atomic<_Tp> : __atomic_base<_Tp> { auto __builtin_op = [](auto __a, auto __builtin_operand, auto __order) { return
std::__cxx_atomic_fetch_sub(__a, __builtin_operand, __order); }; - return __rmw_op(std::forward<_This>(__self), __operand, __m, std::minus<>{}, __builtin_op); + auto __minus = [](auto __a, auto __b) { return __a - __b; }; + return __rmw_op(std::forward<_This>(__self), __operand, __m, __minus, __builtin_op); } public: diff --git a/lld/CODE_OWNERS.TXT b/lld/CODE_OWNERS.TXT deleted file mode 100644 index 44972c0d345a8..0000000000000 --- a/lld/CODE_OWNERS.TXT +++ /dev/null @@ -1,26 +0,0 @@ -This file is a list of the people responsible for ensuring that patches for a -particular part of LLD are reviewed, either by themself or by someone else. -They are also the gatekeepers for their part of LLD, with the final word on -what goes in or not. - -The list is sorted by surname and formatted to allow easy grepping and -beautification by scripts. The fields are: name (N), email (E), web-address -(W), PGP key ID and fingerprint (P), description (D), and snail-mail address -(S). Each entry should contain at least the (N), (E) and (D) fields. - - -N: Rui Ueyama -E: ruiu@google.com -D: COFF, ELF backends (COFF/* ELF/*) - -N: Lang Hames, Nick Kledzik -E: lhames@gmail.com, kledzik@apple.com -D: Old Mach-O backend - -N: Sam Clegg -E: sbc@chromium.org -D: WebAssembly backend (wasm/*) - -N: Jez Ng, Greg McGary, Shoaib Meenai -E: jezng@fb.com, gkm@fb.com, smeenai@fb.com -D: New Mach-O backend diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 57cb443798cd8..9e6b17e87c9e7 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -9,6 +9,7 @@ #ifndef LLD_COFF_CONFIG_H #define LLD_COFF_CONFIG_H +#include "lld/Common/ErrorHandler.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" @@ -27,6 +28,7 @@ namespace lld::coff { using llvm::COFF::IMAGE_FILE_MACHINE_UNKNOWN; using llvm::COFF::WindowsSubsystem; using llvm::StringRef; +class COFFLinkerContext; class DefinedAbsolute; class StringChunk; class Symbol; @@ -332,6 +334,48 @@ struct Configuration { BuildIDHash buildIDHash = BuildIDHash::None; }; +struct COFFSyncStream : SyncStream { + COFFLinkerContext &ctx; + COFFSyncStream(COFFLinkerContext &ctx, DiagLevel level); +}; + +template +std::enable_if_t>, + const COFFSyncStream &> +operator<<(const COFFSyncStream &s, T &&v) { + s.os << std::forward(v); + return s; +} + +inline const COFFSyncStream &operator<<(const COFFSyncStream &s, + const char *v) { + s.os << v; + return s; +} + +inline const COFFSyncStream &operator<<(const COFFSyncStream &s, Error v) { + s.os << llvm::toString(std::move(v)); + return s; +} + +// Report a log if -verbose is specified. +COFFSyncStream Log(COFFLinkerContext &ctx); + +// Print a message to stdout. +COFFSyncStream Msg(COFFLinkerContext &ctx); + +// Report a warning. Upgraded to an error if /WX is specified. +COFFSyncStream Warn(COFFLinkerContext &ctx); + +// Report an error that will suppress the output file generation. +COFFSyncStream Err(COFFLinkerContext &ctx); + +// Report a fatal error that exits immediately. This should generally be avoided +// in favor of Err. +COFFSyncStream Fatal(COFFLinkerContext &ctx); + +uint64_t errCount(COFFLinkerContext &ctx); + } // namespace lld::coff #endif diff --git a/lld/COFF/DebugTypes.cpp b/lld/COFF/DebugTypes.cpp index 7689ad163a657..08f61e0d44621 100644 --- a/lld/COFF/DebugTypes.cpp +++ b/lld/COFF/DebugTypes.cpp @@ -674,8 +674,8 @@ void TpiSource::mergeTypeRecord(TypeIndex curIndex, CVType ty) { funcIdToType.push_back({funcId, funcType}); } else { StringRef fname = file ? 
file->getName() : ""; - warn("corrupt LF_[M]FUNC_ID record 0x" + utohexstr(curIndex.getIndex()) + - " in " + fname); + Warn(ctx) << "corrupt LF_[M]FUNC_ID record 0x" + << utohexstr(curIndex.getIndex()) << " in " << fname; } } } @@ -836,7 +836,7 @@ void UseTypeServerSource::remapTpiWithGHashes(GHashState *g) { void PrecompSource::loadGHashes() { if (getDebugH(file)) { - warn("ignoring .debug$H section; pch with ghash is not implemented"); + Warn(ctx) << "ignoring .debug$H section; pch with ghash is not implemented"; } uint32_t ghashIdx = 0; @@ -864,7 +864,7 @@ void PrecompSource::loadGHashes() { void UsePrecompSource::loadGHashes() { auto e = findPrecompMap(file, precompDependency); if (!e) { - warn(toString(e.takeError())); + Warn(ctx) << e.takeError(); return; } diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index a0bff69c6302a..11e13f20c8042 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -56,11 +56,33 @@ #include #include +using namespace lld; +using namespace lld::coff; using namespace llvm; using namespace llvm::object; using namespace llvm::COFF; using namespace llvm::sys; +COFFSyncStream::COFFSyncStream(COFFLinkerContext &ctx, DiagLevel level) + : SyncStream(ctx.e, level), ctx(ctx) {} + +COFFSyncStream coff::Log(COFFLinkerContext &ctx) { + return {ctx, DiagLevel::Log}; +} +COFFSyncStream coff::Msg(COFFLinkerContext &ctx) { + return {ctx, DiagLevel::Msg}; +} +COFFSyncStream coff::Warn(COFFLinkerContext &ctx) { + return {ctx, DiagLevel::Warn}; +} +COFFSyncStream coff::Err(COFFLinkerContext &ctx) { + return {ctx, DiagLevel::Err}; +} +COFFSyncStream coff::Fatal(COFFLinkerContext &ctx) { + return {ctx, DiagLevel::Fatal}; +} +uint64_t coff::errCount(COFFLinkerContext &ctx) { return ctx.e.errorCount; } + namespace lld::coff { bool link(ArrayRef args, llvm::raw_ostream &stdoutOS, @@ -75,7 +97,7 @@ bool link(ArrayRef args, llvm::raw_ostream &stdoutOS, ctx->driver.linkerMain(args); - return errorCount() == 0; + return errCount(*ctx) == 0; } // Parse options of the form "old;new". @@ -212,7 +234,8 @@ void LinkerDriver::addBuffer(std::unique_ptr mb, ctx.symtab.addFile(make(ctx, mbref)); break; case file_magic::coff_cl_gl_object: - error(filename + ": is not a native COFF file. Recompile without /GL"); + Err(ctx) << filename + << ": is not a native COFF file. Recompile without /GL"; break; case file_magic::pecoff_executable: if (ctx.config.mingw) { @@ -302,7 +325,7 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName, obj->parentName = parentName; ctx.symtab.addFile(obj); - log("Loaded " + toString(obj) + " for " + symName); + Log(ctx) << "Loaded " << obj << " for " << symName; } void LinkerDriver::enqueueArchiveMember(const Archive::Child &c, @@ -310,9 +333,9 @@ void LinkerDriver::enqueueArchiveMember(const Archive::Child &c, StringRef parentName) { auto reportBufferError = [=](Error &&e, StringRef childName) { - fatal("could not get the buffer for the member defining symbol " + - toCOFFString(ctx, sym) + ": " + parentName + "(" + childName + - "): " + toString(std::move(e))); + Fatal(ctx) << "could not get the buffer for the member defining symbol " + << &sym << ": " << parentName << "(" << childName + << "): " << std::move(e); }; if (!c.getParent()->isThin()) { @@ -361,7 +384,7 @@ void LinkerDriver::parseDirectives(InputFile *file) { if (s.empty()) return; - log("Directives: " + toString(file) + ": " + s); + Log(ctx) << "Directives: " << file << ": " << s; ArgParser parser(ctx); // .drectve is always tokenized using Windows shell rules. 
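The Driver.cpp conversions above and below all follow one pattern: a free function returns a context-bound stream temporary, operands are appended with <<, and the finished message is emitted when the temporary dies at the end of the full expression. A minimal usage sketch, assuming only what the Config.h hunk shows (COFFSyncStream buffering into s.os, and Warn/Err constructing one with the matching DiagLevel); reportBadAlign itself is a hypothetical helper, for illustration only:

  // One diagnostic per statement: '<<' builds the message in the stream's
  // buffer, and (assumed here) the destructor hands it to the context's
  // error handler, so messages from parallel passes do not interleave.
  static void reportBadAlign(COFFLinkerContext &ctx, uint64_t align) {
    Warn(ctx) << "/align: not a power of two: " << align; // flushed here
  }

Relative to the old warn(Twine + ...) style, this avoids concatenating temporary strings at every call site and ties each diagnostic to a specific COFFLinkerContext instead of global state, which is also why errorCount() becomes errCount(ctx) in the link() hunk above.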
@@ -414,7 +437,7 @@ void LinkerDriver::parseDirectives(InputFile *file) { break; case OPT_entry: if (!arg->getValue()[0]) - fatal("missing entry point symbol name"); + Fatal(ctx) << "missing entry point symbol name"; ctx.config.entry = addUndefined(mangle(arg->getValue()), true); break; case OPT_failifmismatch: @@ -779,14 +802,14 @@ StringRef LinkerDriver::findDefaultEntry() { if (findUnderscoreMangle("wWinMain")) { if (!findUnderscoreMangle("WinMain")) return mangle("wWinMainCRTStartup"); - warn("found both wWinMain and WinMain; using latter"); + Warn(ctx) << "found both wWinMain and WinMain; using latter"; } return mangle("WinMainCRTStartup"); } if (findUnderscoreMangle("wmain")) { if (!findUnderscoreMangle("main")) return mangle("wmainCRTStartup"); - warn("found both wmain and main; using latter"); + Warn(ctx) << "found both wmain and main; using latter"; } return mangle("mainCRTStartup"); } @@ -805,9 +828,9 @@ WindowsSubsystem LinkerDriver::inferSubsystem() { bool haveWWinMain = findUnderscoreMangle("wWinMain"); if (haveMain || haveWMain) { if (haveWinMain || haveWWinMain) { - warn(std::string("found ") + (haveMain ? "main" : "wmain") + " and " + - (haveWinMain ? "WinMain" : "wWinMain") + - "; defaulting to /subsystem:console"); + Warn(ctx) << "found " << (haveMain ? "main" : "wmain") << " and " + << (haveWinMain ? "WinMain" : "wWinMain") + << "; defaulting to /subsystem:console"; } return IMAGE_SUBSYSTEM_WINDOWS_CUI; } @@ -887,7 +910,8 @@ static std::string createResponseFile(const opt::InputArgList &args, return std::string(data); } -static unsigned parseDebugTypes(const opt::InputArgList &args) { +static unsigned parseDebugTypes(COFFLinkerContext &ctx, + const opt::InputArgList &args) { unsigned debugTypes = static_cast(DebugType::None); if (auto *a = args.getLastArg(OPT_debugtype)) { @@ -902,7 +926,7 @@ static unsigned parseDebugTypes(const opt::InputArgList &args) { .Case("fixup", static_cast(DebugType::Fixup)) .Default(0); if (v == 0) { - warn("/debugtype: unknown option '" + type + "'"); + Warn(ctx) << "/debugtype: unknown option '" << type << "'"; continue; } debugTypes |= v; @@ -1139,7 +1163,8 @@ void LinkerDriver::parseOrderFile(StringRef arg) { if (set.count(s) == 0) { if (ctx.config.warnMissingOrderSymbol) - warn("/order:" + arg + ": missing symbol: " + s + " [LNK4037]"); + Warn(ctx) << "/order:" << arg << ": missing symbol: " << s + << " [LNK4037]"; } else ctx.config.order[s] = INT_MIN + ctx.config.order.size(); } @@ -1166,7 +1191,7 @@ void LinkerDriver::parseCallGraphFile(StringRef path) { Symbol *sym = map.lookup(name); if (!sym) { if (ctx.config.warnMissingOrderSymbol) - warn(path + ": no such symbol: " + name); + Warn(ctx) << path << ": no such symbol: " << name; return nullptr; } @@ -1309,8 +1334,8 @@ void LinkerDriver::parsePDBAltPath() { else if (var.equals_insensitive("%_ext%")) buf.append(binaryExtension); else { - warn("only %_PDB% and %_EXT% supported in /pdbaltpath:, keeping " + var + - " as literal"); + Warn(ctx) << "only %_PDB% and %_EXT% supported in /pdbaltpath:, keeping " + << var << " as literal"; buf.append(var); } @@ -1615,7 +1640,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // because it doesn't start with "/", but we deliberately chose "--" to // avoid conflict with /version and for compatibility with clang-cl. 
if (args.hasArg(OPT_dash_dash_version)) { - message(getLLDVersion()); + Msg(ctx) << getLLDVersion(); return; } @@ -1668,7 +1693,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { addLibSearchPaths(); } else { if (args.hasArg(OPT_vctoolsdir, OPT_winsysroot)) - warn("ignoring /vctoolsdir or /winsysroot flags in MinGW mode"); + Warn(ctx) << "ignoring /vctoolsdir or /winsysroot flags in MinGW mode"; } } @@ -1729,7 +1754,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { StringRef(str).split(vec, ','); for (StringRef s : vec) { if (s == "fastlink") { - warn("/debug:fastlink unsupported; using /debug:full"); + Warn(ctx) << "/debug:fastlink unsupported; using /debug:full"; s = "full"; } if (s == "none") { @@ -1772,7 +1797,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { config->demangle = args.hasFlag(OPT_demangle, OPT_demangle_no, true); // Handle /debugtype - config->debugTypes = parseDebugTypes(args); + config->debugTypes = parseDebugTypes(ctx, args); // Handle /driver[:uponly|:wdm]. config->driverUponly = args.hasArg(OPT_driver_uponly) || @@ -1809,7 +1834,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // Handle /pdbstripped if (args.hasArg(OPT_pdbstripped)) - warn("ignoring /pdbstripped flag, it is not yet supported"); + Warn(ctx) << "ignoring /pdbstripped flag, it is not yet supported"; // Handle /noentry if (args.hasArg(OPT_noentry)) { @@ -2091,7 +2116,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (!isPowerOf2_64(config->align)) error("/align: not a power of two: " + StringRef(arg->getValue())); if (!args.hasArg(OPT_driver)) - warn("/align specified without /driver; image may not run"); + Warn(ctx) << "/align specified without /driver; image may not run"; } // Handle /aligncomm @@ -2176,31 +2201,33 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { OPT_lld_allow_duplicate_weak_no, config->mingw); if (args.hasFlag(OPT_inferasanlibs, OPT_inferasanlibs_no, false)) - warn("ignoring '/inferasanlibs', this flag is not supported"); + Warn(ctx) << "ignoring '/inferasanlibs', this flag is not supported"; if (config->incremental && args.hasArg(OPT_profile)) { - warn("ignoring '/incremental' due to '/profile' specification"); + Warn(ctx) << "ignoring '/incremental' due to '/profile' specification"; config->incremental = false; } if (config->incremental && args.hasArg(OPT_order)) { - warn("ignoring '/incremental' due to '/order' specification"); + Warn(ctx) << "ignoring '/incremental' due to '/order' specification"; config->incremental = false; } if (config->incremental && config->doGC) { - warn("ignoring '/incremental' because REF is enabled; use '/opt:noref' to " - "disable"); + Warn(ctx) << "ignoring '/incremental' because REF is enabled; use " + "'/opt:noref' to " + "disable"; config->incremental = false; } if (config->incremental && config->doICF != ICFLevel::None) { - warn("ignoring '/incremental' because ICF is enabled; use '/opt:noicf' to " - "disable"); + Warn(ctx) << "ignoring '/incremental' because ICF is enabled; use " + "'/opt:noicf' to " + "disable"; config->incremental = false; } - if (errorCount()) + if (errCount(ctx)) return; std::set wholeArchives; @@ -2262,7 +2289,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // We should have inferred a machine type by now from the input files, but if // not we assume x64. if (config->machine == IMAGE_FILE_MACHINE_UNKNOWN) { - warn("/machine is not specified. x64 is assumed"); + Warn(ctx) << "/machine is not specified. 
x64 is assumed"; config->machine = AMD64; addWinSysRootLibSearchPaths(); } @@ -2279,7 +2306,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { stream << " " << path << "\n"; } - message(buffer); + Msg(ctx) << buffer; } // Process files specified as /defaultlib. These must be processed after @@ -2437,8 +2464,8 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } if (config->lldmapFile != "" && config->lldmapFile == config->mapFile) { - warn("/lldmap and /map have the same output file '" + config->mapFile + - "'.\n>>> ignoring /lldmap"); + Warn(ctx) << "/lldmap and /map have the same output file '" + << config->mapFile << "'.\n>>> ignoring /lldmap"; config->lldmapFile.clear(); } @@ -2718,7 +2745,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { Symbol *sym = ctx.symtab.find(name); if (!sym) { - warn("/aligncomm symbol " + name + " not found"); + Warn(ctx) << "/aligncomm symbol " << name << " not found"; continue; } diff --git a/lld/COFF/DriverUtils.cpp b/lld/COFF/DriverUtils.cpp index 8a72d74bd27ba..bb6394aca4984 100644 --- a/lld/COFF/DriverUtils.cpp +++ b/lld/COFF/DriverUtils.cpp @@ -172,7 +172,7 @@ void LinkerDriver::parseMerge(StringRef s) { if (!inserted) { StringRef existing = pair.first->second; if (existing != to) - warn(s + ": already merged into " + existing); + Warn(ctx) << s << ": already merged into " << existing; } } @@ -741,12 +741,12 @@ void LinkerDriver::fixupExports() { continue; } if (existing->source == e.source) { - warn(Twine("duplicate ") + exportSourceName(existing->source) + - " option: " + e.name); + Warn(ctx) << "duplicate " << exportSourceName(existing->source) + << " option: " << e.name; } else { - warn("duplicate export: " + e.name + - Twine(" first seen in " + exportSourceName(existing->source) + - Twine(", now in " + exportSourceName(e.source)))); + Warn(ctx) << "duplicate export: " << e.name << " first seen in " + << exportSourceName(existing->source) << ", now in " + << exportSourceName(e.source); } } ctx.config.exports = std::move(v); @@ -822,7 +822,7 @@ MemoryBufferRef LinkerDriver::convertResToCOFF(ArrayRef mbs, for (const auto &dupeDiag : duplicates) if (ctx.config.forceMultipleRes) - warn(dupeDiag); + Warn(ctx) << dupeDiag; else error(dupeDiag); @@ -922,7 +922,7 @@ opt::InputArgList ArgParser::parse(ArrayRef argv) { std::string msg = "Command line:"; for (const char *s : expandedArgv) msg += " " + std::string(s); - message(msg); + Msg(ctx) << msg; } // Save the command line after response file expansion so we can write it to @@ -945,14 +945,15 @@ opt::InputArgList ArgParser::parse(ArrayRef argv) { for (opt::Arg *arg : args.filtered(OPT_UNKNOWN)) { std::string nearest; if (ctx.optTable.findNearest(arg->getAsString(args), nearest) > 1) - warn("ignoring unknown argument '" + arg->getAsString(args) + "'"); + Warn(ctx) << "ignoring unknown argument '" << arg->getAsString(args) + << "'"; else - warn("ignoring unknown argument '" + arg->getAsString(args) + - "', did you mean '" + nearest + "'"); + Warn(ctx) << "ignoring unknown argument '" << arg->getAsString(args) + << "', did you mean '" << nearest << "'"; } if (args.hasArg(OPT_lib)) - warn("ignoring /lib since it's not the first argument"); + Warn(ctx) << "ignoring /lib since it's not the first argument"; return args; } @@ -994,7 +995,7 @@ ParsedDirectives ArgParser::parseDirectives(StringRef s) { if (missingCount) fatal(Twine(result.args.getArgString(missingIndex)) + ": missing argument"); for (auto *arg : result.args.filtered(OPT_UNKNOWN)) - warn("ignoring unknown argument: " + 
arg->getAsString(result.args)); + Warn(ctx) << "ignoring unknown argument: " << arg->getAsString(result.args); return result; } diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 6b5efb34b3f3e..65def1f509a4d 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -70,6 +70,11 @@ std::string lld::toString(const coff::InputFile *file) { .str(); } +const COFFSyncStream &coff::operator<<(const COFFSyncStream &s, + const InputFile *f) { + return s << toString(f); +} + /// Checks that Source is compatible with being a weak alias to Target. /// If Source is Undefined and has no weak alias set, makes it a weak /// alias to Target. @@ -201,7 +206,7 @@ void ObjFile::initializeECThunks() { case Arm64ECThunkType::GuestExit: break; default: - warn("Ignoring unknown EC thunk type " + Twine(entry->type)); + Warn(ctx) << "Ignoring unknown EC thunk type " << entry->type; } } } diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 77f7e298166ee..e727d1376e2f2 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -40,6 +40,8 @@ class DWARFCache; namespace coff { class COFFLinkerContext; +const COFFSyncStream &operator<<(const COFFSyncStream &, const InputFile *); + std::vector getArchiveMembers(llvm::object::Archive *file); using llvm::COFF::IMAGE_FILE_MACHINE_UNKNOWN; diff --git a/lld/COFF/PDB.cpp b/lld/COFF/PDB.cpp index c20b54a5d42e4..b4b10ef8913f0 100644 --- a/lld/COFF/PDB.cpp +++ b/lld/COFF/PDB.cpp @@ -1006,11 +1006,10 @@ static void warnUnusable(InputFile *f, Error e, bool shouldWarn) { consumeError(std::move(e)); return; } - auto msg = "Cannot use debug info for '" + toString(f) + "' [LNK4099]"; + auto diag = Warn(f->ctx); + diag << "Cannot use debug info for '" << f << "' [LNK4099]"; if (e) - warn(msg + "\n>>> failed to load reference " + toString(std::move(e))); - else - warn(msg); + diag << "\n>>> failed to load reference " << std::move(e); } // Allocate memory for a .debug$S / .debug$F section and relocate it. @@ -1317,7 +1316,7 @@ void PDBLinker::printStats() { printLargeInputTypeRecs("IPI", tMerger.ipiCounts, tMerger.getIDTable()); } - message(buffer); + Msg(ctx) << buffer; } void PDBLinker::addNatvisFiles() { diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 0c6df701284b7..9f41421722286 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -327,7 +327,8 @@ void SymbolTable::loadMinGWSymbols() { // If it's lazy or already defined, hook it up as weak alias. if (l->isLazy() || isa(l)) { if (ctx.config.warnStdcallFixup) - warn("Resolving " + origName + " by linking to " + newName); + Warn(ctx) << "Resolving " << origName << " by linking to " + << newName; else log("Resolving " + origName + " by linking to " + newName); undef->setWeakAlias(l); @@ -379,9 +380,9 @@ bool SymbolTable::handleMinGWAutomaticImport(Symbol *sym, StringRef name) { toString(cast(imp)->file)); impSize = sizeof(DefinedRegular); } else { - warn("unable to automatically import " + name + " from " + imp->getName() + - " from " + toString(cast(imp)->file) + - "; unexpected symbol type"); + Warn(ctx) << "unable to automatically import " << name << " from " + << imp->getName() << " from " << cast(imp)->file + << "; unexpected symbol type"; return false; } sym->replaceKeepingName(imp, impSize); @@ -412,7 +413,7 @@ bool SymbolTable::handleMinGWAutomaticImport(Symbol *sym, StringRef name) { /// objFiles and bitcodeFiles (if not nullptr) are used to report where /// undefined symbols are referenced. 
static void reportProblemSymbols( - const COFFLinkerContext &ctx, const SmallPtrSetImpl &undefs, + COFFLinkerContext &ctx, const SmallPtrSetImpl &undefs, const DenseMap *localImports, bool needBitcodeFiles) { // Return early if there is nothing to report (which should be // the common case). @@ -425,8 +426,9 @@ static void reportProblemSymbols( ctx.config.forceUnresolved); if (localImports) if (Symbol *imp = localImports->lookup(b)) - warn(": locally defined symbol imported: " + toString(ctx, *imp) + - " (defined in " + toString(imp->getFile()) + ") [LNK4217]"); + Warn(ctx) << ": locally defined symbol imported: " + << toString(ctx, *imp) << " (defined in " + << toString(imp->getFile()) << ") [LNK4217]"; } std::vector undefDiags; @@ -447,9 +449,9 @@ static void reportProblemSymbols( } if (localImports) if (Symbol *imp = localImports->lookup(sym)) - warn(toString(file) + - ": locally defined symbol imported: " + toString(ctx, *imp) + - " (defined in " + toString(imp->getFile()) + ") [LNK4217]"); + Warn(ctx) << file << ": locally defined symbol imported: " + << toString(ctx, *imp) << " (defined in " << imp->getFile() + << ") [LNK4217]"; } }; @@ -814,7 +816,7 @@ void SymbolTable::reportDuplicate(Symbol *existing, InputFile *newFile, existing->getName()); if (ctx.config.forceMultiple) - warn(msg); + Warn(ctx) << msg; else error(msg); } diff --git a/lld/COFF/Symbols.cpp b/lld/COFF/Symbols.cpp index f2fa2392ecbbc..383f62afd8e1d 100644 --- a/lld/COFF/Symbols.cpp +++ b/lld/COFF/Symbols.cpp @@ -53,6 +53,13 @@ std::string toCOFFString(const COFFLinkerContext &ctx, return maybeDemangleSymbol(ctx, b.getName()); } +const COFFSyncStream & +coff::operator<<(const COFFSyncStream &s, + const llvm::object::Archive::Symbol *sym) { + s << maybeDemangleSymbol(s.ctx, sym->getName()); + return s; +} + namespace coff { void Symbol::computeName() { diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h index 203a542466c68..6fabed9fc8f2b 100644 --- a/lld/COFF/Symbols.h +++ b/lld/COFF/Symbols.h @@ -35,6 +35,9 @@ class InputFile; class ObjFile; class SymbolTable; +const COFFSyncStream &operator<<(const COFFSyncStream &, + const llvm::object::Archive::Symbol *); + // The base class for real symbol classes. class Symbol { public: diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index d3e326378ed2d..3ec8e42f97c8e 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1313,7 +1313,7 @@ void Writer::createExportTable() { // Allow using a custom built export table from input object files, instead // of having the linker synthesize the tables. if (ctx.config.hadExplicitExports) - warn("literal .edata sections override exports"); + Warn(ctx) << "literal .edata sections override exports"; } else if (!ctx.config.exports.empty()) { for (Chunk *c : edata.chunks) edataSec->addChunk(c); @@ -1325,7 +1325,7 @@ void Writer::createExportTable() { // Warn on exported deleting destructor. 
for (auto e : ctx.config.exports) if (e.sym && e.sym->getName().starts_with("??_G")) - warn("export of deleting dtor: " + toString(ctx, *e.sym)); + Warn(ctx) << "export of deleting dtor: " << toString(ctx, *e.sym); } void Writer::removeUnusedSections() { @@ -1457,9 +1457,10 @@ void Writer::createSymbolAndStringTable() { if ((sec->header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE) == 0) continue; if (ctx.config.warnLongSectionNames) { - warn("section name " + sec->name + - " is longer than 8 characters and will use a non-standard string " - "table"); + Warn(ctx) + << "section name " << sec->name + << " is longer than 8 characters and will use a non-standard string " + "table"; } sec->setStringTableOff(addEntryToStringTable(sec->name)); } @@ -2086,8 +2087,8 @@ void Writer::getSymbolsFromSections(ObjFile *file, // Validate that the contents look like symbol table indices. ArrayRef data = c->getContents(); if (data.size() % 4 != 0) { - warn("ignoring " + c->getSectionName() + - " symbol table index section in object " + toString(file)); + Warn(ctx) << "ignoring " << c->getSectionName() + << " symbol table index section in object " << file; continue; } @@ -2098,8 +2099,8 @@ void Writer::getSymbolsFromSections(ObjFile *file, ArrayRef objSymbols = file->getSymbols(); for (uint32_t symIndex : symIndices) { if (symIndex >= objSymbols.size()) { - warn("ignoring invalid symbol table index in section " + - c->getSectionName() + " in object " + toString(file)); + Warn(ctx) << "ignoring invalid symbol table index in section " + << c->getSectionName() << " in object " << file; continue; } if (Symbol *s = objSymbols[symIndex]) { @@ -2606,7 +2607,8 @@ void Writer::prepareLoadConfig() { auto *b = cast_if_present(sym); if (!b) { if (ctx.config.guardCF != GuardCFLevel::Off) - warn("Control Flow Guard is enabled but '_load_config_used' is missing"); + Warn(ctx) + << "Control Flow Guard is enabled but '_load_config_used' is missing"; return; } @@ -2616,13 +2618,13 @@ void Writer::prepareLoadConfig() { uint8_t *symBuf = secBuf + (b->getRVA() - sec->getRVA()); uint32_t expectedAlign = ctx.config.is64() ? 
8 : 4; if (b->getChunk()->getAlignment() < expectedAlign) - warn("'_load_config_used' is misaligned (expected alignment to be " + - Twine(expectedAlign) + " bytes, got " + - Twine(b->getChunk()->getAlignment()) + " instead)"); + Warn(ctx) << "'_load_config_used' is misaligned (expected alignment to be " + << expectedAlign << " bytes, got " + << b->getChunk()->getAlignment() << " instead)"; else if (!isAligned(Align(expectedAlign), b->getRVA())) - warn("'_load_config_used' is misaligned (RVA is 0x" + - Twine::utohexstr(b->getRVA()) + " not aligned to " + - Twine(expectedAlign) + " bytes)"); + Warn(ctx) << "'_load_config_used' is misaligned (RVA is 0x" + << Twine::utohexstr(b->getRVA()) << " not aligned to " + << expectedAlign << " bytes)"; if (ctx.config.is64()) prepareLoadConfig(reinterpret_cast(symBuf)); diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp index 5b5ad482ea127..99fc750486e4b 100644 --- a/lld/ELF/Arch/AArch64.cpp +++ b/lld/ELF/Arch/AArch64.cpp @@ -154,9 +154,9 @@ RelExpr AArch64::getRelExpr(RelType type, const Symbol &s, case R_AARCH64_MOVW_UABS_G3: return R_ABS; case R_AARCH64_AUTH_ABS64: - return R_AARCH64_AUTH; + return RE_AARCH64_AUTH; case R_AARCH64_TLSDESC_ADR_PAGE21: - return R_AARCH64_TLSDESC_PAGE; + return RE_AARCH64_TLSDESC_PAGE; case R_AARCH64_TLSDESC_LD64_LO12: case R_AARCH64_TLSDESC_ADD_LO12: return R_TLSDESC; @@ -198,15 +198,15 @@ RelExpr AArch64::getRelExpr(RelType type, const Symbol &s, return R_PC; case R_AARCH64_ADR_PREL_PG_HI21: case R_AARCH64_ADR_PREL_PG_HI21_NC: - return R_AARCH64_PAGE_PC; + return RE_AARCH64_PAGE_PC; case R_AARCH64_LD64_GOT_LO12_NC: case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: return R_GOT; case R_AARCH64_LD64_GOTPAGE_LO15: - return R_AARCH64_GOT_PAGE; + return RE_AARCH64_GOT_PAGE; case R_AARCH64_ADR_GOT_PAGE: case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: - return R_AARCH64_GOT_PAGE_PC; + return RE_AARCH64_GOT_PAGE_PC; case R_AARCH64_GOTPCREL32: case R_AARCH64_GOT_LD_PREL19: return R_GOT_PC; @@ -222,7 +222,7 @@ RelExpr AArch64::getRelExpr(RelType type, const Symbol &s, RelExpr AArch64::adjustTlsExpr(RelType type, RelExpr expr) const { if (expr == R_RELAX_TLS_GD_TO_IE) { if (type == R_AARCH64_TLSDESC_ADR_PAGE21) - return R_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC; + return RE_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC; return R_RELAX_TLS_GD_TO_IE_ABS; } return expr; @@ -877,7 +877,7 @@ bool AArch64Relaxer::tryRelaxAdrpLdr(const Relocation &adrpRel, if (val != llvm::SignExtend64(val, 33)) return false; - Relocation adrpSymRel = {R_AARCH64_PAGE_PC, R_AARCH64_ADR_PREL_PG_HI21, + Relocation adrpSymRel = {RE_AARCH64_PAGE_PC, R_AARCH64_ADR_PREL_PG_HI21, adrpRel.offset, /*addend=*/0, &sym}; Relocation addRel = {R_ABS, R_AARCH64_ADD_ABS_LO12_NC, ldrRel.offset, /*addend=*/0, &sym}; @@ -922,21 +922,21 @@ void AArch64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { } switch (rel.expr) { - case R_AARCH64_GOT_PAGE_PC: + case RE_AARCH64_GOT_PAGE_PC: if (i + 1 < size && relaxer.tryRelaxAdrpLdr(rel, sec.relocs()[i + 1], secAddr, buf)) { ++i; continue; } break; - case R_AARCH64_PAGE_PC: + case RE_AARCH64_PAGE_PC: if (i + 1 < size && relaxer.tryRelaxAdrpAdd(rel, sec.relocs()[i + 1], secAddr, buf)) { ++i; continue; } break; - case R_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC: + case RE_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC: case R_RELAX_TLS_GD_TO_IE_ABS: relaxTlsGdToIe(loc, rel, val); continue; diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 62685b1e7dede..29a72d35af666 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -136,7 +136,7 
@@ RelExpr ARM::getRelExpr(RelType type, const Symbol &s, // GOT(S) + A - P return R_GOT_PC; case R_ARM_SBREL32: - return R_ARM_SBREL; + return RE_ARM_SBREL; case R_ARM_TARGET1: return ctx.arg.target1Rel ? R_PC : R_ABS; case R_ARM_TARGET2: @@ -176,14 +176,14 @@ RelExpr ARM::getRelExpr(RelType type, const Symbol &s, case R_ARM_THM_ALU_PREL_11_0: case R_ARM_THM_PC8: case R_ARM_THM_PC12: - return R_ARM_PCA; + return RE_ARM_PCA; case R_ARM_MOVW_BREL_NC: case R_ARM_MOVW_BREL: case R_ARM_MOVT_BREL: case R_ARM_THM_MOVW_BREL_NC: case R_ARM_THM_MOVW_BREL: case R_ARM_THM_MOVT_BREL: - return R_ARM_SBREL; + return RE_ARM_SBREL; case R_ARM_NONE: return R_NONE; case R_ARM_TLS_LE32: diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index ebfdbafc9983e..3280c34cb6ed0 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -428,7 +428,7 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s, case R_LARCH_SUB_ULEB128: // The LoongArch add/sub relocs behave like the RISCV counterparts; reuse // the RelExpr to avoid code duplication. - return R_RISCV_ADD; + return RE_RISCV_ADD; case R_LARCH_32_PCREL: case R_LARCH_64_PCREL: case R_LARCH_PCREL20_S2: @@ -444,17 +444,17 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s, case R_LARCH_TLS_IE_PC_HI20: case R_LARCH_TLS_IE64_PC_LO20: case R_LARCH_TLS_IE64_PC_HI12: - return R_LOONGARCH_GOT_PAGE_PC; + return RE_LOONGARCH_GOT_PAGE_PC; case R_LARCH_GOT_PC_LO12: case R_LARCH_TLS_IE_PC_LO12: - return R_LOONGARCH_GOT; + return RE_LOONGARCH_GOT; case R_LARCH_TLS_LD_PC_HI20: case R_LARCH_TLS_GD_PC_HI20: - return R_LOONGARCH_TLSGD_PAGE_PC; + return RE_LOONGARCH_TLSGD_PAGE_PC; case R_LARCH_PCALA_HI20: - // Why not R_LOONGARCH_PAGE_PC, majority of references don't go through PLT - // anyway so why waste time checking only to get everything relaxed back to - // it? + // Why not RE_LOONGARCH_PAGE_PC, majority of references don't go through + // PLT anyway so why waste time checking only to get everything relaxed back + // to it? // // This is again due to the R_LARCH_PCALA_LO12 on JIRL case, where we want // both the HI20 and LO12 to potentially refer to the PLT. But in reality @@ -474,12 +474,12 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s, // // So, unfortunately we have to again workaround this quirk the same way as // BFD: assuming every R_LARCH_PCALA_HI20 is potentially PLT-needing, only - // relaxing back to R_LOONGARCH_PAGE_PC if it's known not so at a later + // relaxing back to RE_LOONGARCH_PAGE_PC if it's known not so at a later // stage. 
- return R_LOONGARCH_PLT_PAGE_PC; + return RE_LOONGARCH_PLT_PAGE_PC; case R_LARCH_PCALA64_LO20: case R_LARCH_PCALA64_HI12: - return R_LOONGARCH_PAGE_PC; + return RE_LOONGARCH_PAGE_PC; case R_LARCH_GOT_HI20: case R_LARCH_GOT_LO12: case R_LARCH_GOT64_LO20: @@ -501,7 +501,7 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s, case R_LARCH_TLS_DESC_PC_HI20: case R_LARCH_TLS_DESC64_PC_LO20: case R_LARCH_TLS_DESC64_PC_HI12: - return R_LOONGARCH_TLSDESC_PAGE_PC; + return RE_LOONGARCH_TLSDESC_PAGE_PC; case R_LARCH_TLS_DESC_PC_LO12: case R_LARCH_TLS_DESC_LD: case R_LARCH_TLS_DESC_HI20: diff --git a/lld/ELF/Arch/Mips.cpp b/lld/ELF/Arch/Mips.cpp index da76820de240d..121127ae6b21d 100644 --- a/lld/ELF/Arch/Mips.cpp +++ b/lld/ELF/Arch/Mips.cpp @@ -105,7 +105,7 @@ RelExpr MIPS::getRelExpr(RelType type, const Symbol &s, case R_MIPS_GPREL32: case R_MICROMIPS_GPREL16: case R_MICROMIPS_GPREL7_S2: - return R_MIPS_GOTREL; + return RE_MIPS_GOTREL; case R_MIPS_26: case R_MICROMIPS_26_S1: return R_PLT; @@ -122,9 +122,9 @@ RelExpr MIPS::getRelExpr(RelType type, const Symbol &s, // equal to the start of .got section. In that case we consider these // relocations as relative. if (&s == ctx.sym.mipsGpDisp) - return R_MIPS_GOT_GP_PC; + return RE_MIPS_GOT_GP_PC; if (&s == ctx.sym.mipsLocalGp) - return R_MIPS_GOT_GP; + return RE_MIPS_GOT_GP; [[fallthrough]]; case R_MIPS_32: case R_MIPS_64: @@ -163,14 +163,14 @@ RelExpr MIPS::getRelExpr(RelType type, const Symbol &s, case R_MIPS_GOT16: case R_MICROMIPS_GOT16: if (s.isLocal()) - return R_MIPS_GOT_LOCAL_PAGE; + return RE_MIPS_GOT_LOCAL_PAGE; [[fallthrough]]; case R_MIPS_CALL16: case R_MIPS_GOT_DISP: case R_MIPS_TLS_GOTTPREL: case R_MICROMIPS_CALL16: case R_MICROMIPS_TLS_GOTTPREL: - return R_MIPS_GOT_OFF; + return RE_MIPS_GOT_OFF; case R_MIPS_CALL_HI16: case R_MIPS_CALL_LO16: case R_MIPS_GOT_HI16: @@ -179,15 +179,15 @@ RelExpr MIPS::getRelExpr(RelType type, const Symbol &s, case R_MICROMIPS_CALL_LO16: case R_MICROMIPS_GOT_HI16: case R_MICROMIPS_GOT_LO16: - return R_MIPS_GOT_OFF32; + return RE_MIPS_GOT_OFF32; case R_MIPS_GOT_PAGE: - return R_MIPS_GOT_LOCAL_PAGE; + return RE_MIPS_GOT_LOCAL_PAGE; case R_MIPS_TLS_GD: case R_MICROMIPS_TLS_GD: - return R_MIPS_TLSGD; + return RE_MIPS_TLSGD; case R_MIPS_TLS_LDM: case R_MICROMIPS_TLS_LDM: - return R_MIPS_TLSLD; + return RE_MIPS_TLSLD; case R_MIPS_NONE: return R_NONE; default: diff --git a/lld/ELF/Arch/PPC.cpp b/lld/ELF/Arch/PPC.cpp index 417401374436a..3203e27d82fa2 100644 --- a/lld/ELF/Arch/PPC.cpp +++ b/lld/ELF/Arch/PPC.cpp @@ -250,7 +250,7 @@ RelExpr PPC::getRelExpr(RelType type, const Symbol &s, case R_PPC_REL24: return R_PLT_PC; case R_PPC_PLTREL24: - return R_PPC32_PLTREL; + return RE_PPC32_PLTREL; case R_PPC_GOT_TLSGD16: return R_TLSGD_GOT; case R_PPC_GOT_TLSLD16: diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index b55385625a1cf..4edb6af4f09ad 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -1029,12 +1029,12 @@ RelExpr PPC64::getRelExpr(RelType type, const Symbol &s, return R_GOT_PC; case R_PPC64_TOC16_HA: case R_PPC64_TOC16_LO_DS: - return ctx.arg.tocOptimize ? R_PPC64_RELAX_TOC : R_GOTREL; + return ctx.arg.tocOptimize ? 
RE_PPC64_RELAX_TOC : R_GOTREL; case R_PPC64_TOC: - return R_PPC64_TOCBASE; + return RE_PPC64_TOCBASE; case R_PPC64_REL14: case R_PPC64_REL24: - return R_PPC64_CALL_PLT; + return RE_PPC64_CALL_PLT; case R_PPC64_REL24_NOTOC: return R_PLT_PC; case R_PPC64_REL16_LO: @@ -1452,7 +1452,7 @@ bool PPC64::needsThunk(RelExpr expr, RelType type, const InputFile *file, // If the offset exceeds the range of the branch type then it will need // a range-extending thunk. - // See the comment in getRelocTargetVA() about R_PPC64_CALL. + // See the comment in getRelocTargetVA() about RE_PPC64_CALL. return !inBranchRange( type, branchAddr, s.getVA(ctx, a) + getPPC64GlobalEntryToLocalEntryOffset(ctx, s.stOther)); @@ -1490,7 +1490,7 @@ RelExpr PPC64::adjustGotPcExpr(RelType type, int64_t addend, // It only makes sense to optimize pld since paddi means that the address // of the object in the GOT is required rather than the object itself. if ((readPrefixedInst(ctx, loc) & 0xfc000000) == 0xe4000000) - return R_PPC64_RELAX_GOT_PC; + return RE_PPC64_RELAX_GOT_PC; } return R_GOT_PC; } @@ -1574,7 +1574,7 @@ void PPC64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { uint8_t *loc = buf + rel.offset; const uint64_t val = sec.getRelocTargetVA(ctx, rel, secAddr + rel.offset); switch (rel.expr) { - case R_PPC64_RELAX_GOT_PC: { + case RE_PPC64_RELAX_GOT_PC: { // The R_PPC64_PCREL_OPT relocation must appear immediately after // R_PPC64_GOT_PCREL34 in the relocations table at the same offset. // We can only relax R_PPC64_PCREL_OPT if we have also relaxed @@ -1588,7 +1588,7 @@ void PPC64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { relaxGot(loc, rel, val); break; } - case R_PPC64_RELAX_TOC: + case RE_PPC64_RELAX_TOC: // rel.sym refers to the STT_SECTION symbol associated to the .toc input // section. If an R_PPC64_TOC16_LO (.toc + addend) references the TOC // entry, there may be R_PPC64_TOC16_HA not paired with @@ -1598,7 +1598,7 @@ void PPC64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { !tryRelaxPPC64TocIndirection(ctx, rel, loc)) relocate(loc, rel, val); break; - case R_PPC64_CALL: + case RE_PPC64_CALL: // If this is a call to __tls_get_addr, it may be part of a TLS // sequence that has been relaxed and turned into a nop. In this // case, we don't want to handle it as a call. diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 58a71fd9545c5..e150ff26fc3b5 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -282,7 +282,7 @@ RelExpr RISCV::getRelExpr(const RelType type, const Symbol &s, case R_RISCV_SUB16: case R_RISCV_SUB32: case R_RISCV_SUB64: - return R_RISCV_ADD; + return RE_RISCV_ADD; case R_RISCV_JAL: case R_RISCV_BRANCH: case R_RISCV_PCREL_HI20: @@ -299,7 +299,7 @@ RelExpr RISCV::getRelExpr(const RelType type, const Symbol &s, return R_GOT_PC; case R_RISCV_PCREL_LO12_I: case R_RISCV_PCREL_LO12_S: - return R_RISCV_PC_INDIRECT; + return RE_RISCV_PC_INDIRECT; case R_RISCV_TLSDESC_HI20: case R_RISCV_TLSDESC_LOAD_LO12: case R_RISCV_TLSDESC_ADD_LO12: @@ -321,7 +321,7 @@ RelExpr RISCV::getRelExpr(const RelType type, const Symbol &s, return ctx.arg.relax ? 
R_RELAX_HINT : R_NONE; case R_RISCV_SET_ULEB128: case R_RISCV_SUB_ULEB128: - return R_RISCV_LEB128; + return RE_RISCV_LEB128; default: Err(ctx) << getErrorLoc(ctx, loc) << "unknown relocation (" << type.v << ") against symbol " << &s; @@ -650,7 +650,7 @@ void RISCV::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { else tlsdescToIe(ctx, loc, rel, val); continue; - case R_RISCV_LEB128: + case RE_RISCV_LEB128: if (i + 1 < size) { const Relocation &rel1 = relocs[i + 1]; if (rel.type == R_RISCV_SET_ULEB128 && diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 30c2ff4d79ba5..7e76bae19fc6a 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -523,7 +523,7 @@ void InputSection::copyRelocations(Ctx &ctx, uint8_t *buf, addend = target.getImplicitAddend(bufLoc, type); if (ctx.arg.emachine == EM_MIPS && - target.getRelExpr(type, sym, bufLoc) == R_MIPS_GOTREL) { + target.getRelExpr(type, sym, bufLoc) == RE_MIPS_GOTREL) { // Some MIPS relocations depend on "gp" value. By default, // this value has 0x7ff0 offset from a .got section. But // relocatable files produced by a compiler or a linker @@ -655,7 +655,7 @@ static uint64_t getARMStaticBase(const Symbol &sym) { return os->ptLoad->firstSec->addr; } -// For R_RISCV_PC_INDIRECT (R_RISCV_PCREL_LO12_{I,S}), the symbol actually +// For RE_RISCV_PC_INDIRECT (R_RISCV_PCREL_LO12_{I,S}), the symbol actually // points the corresponding R_RISCV_PCREL_HI20 relocation, and the target VA // is calculated using PCREL_HI20's symbol. // @@ -772,25 +772,25 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, case R_DTPREL: case R_RELAX_TLS_LD_TO_LE_ABS: case R_RELAX_GOT_PC_NOPIC: - case R_AARCH64_AUTH: - case R_RISCV_ADD: - case R_RISCV_LEB128: + case RE_AARCH64_AUTH: + case RE_RISCV_ADD: + case RE_RISCV_LEB128: return r.sym->getVA(ctx, a); case R_ADDEND: return a; case R_RELAX_HINT: return 0; - case R_ARM_SBREL: + case RE_ARM_SBREL: return r.sym->getVA(ctx, a) - getARMStaticBase(*r.sym); case R_GOT: case R_RELAX_TLS_GD_TO_IE_ABS: return r.sym->getGotVA(ctx) + a; - case R_LOONGARCH_GOT: + case RE_LOONGARCH_GOT: // The LoongArch TLS GD relocs reuse the R_LARCH_GOT_PC_LO12 reloc r.type // for their page offsets. The arithmetics are different in the TLS case // so we have to duplicate some logic here. if (r.sym->hasFlag(NEEDS_TLSGD) && r.type != R_LARCH_TLS_IE_PC_LO12) - // Like R_LOONGARCH_TLSGD_PAGE_PC but taking the absolute value. + // Like RE_LOONGARCH_TLSGD_PAGE_PC but taking the absolute value. 
return ctx.in.got->getGlobalDynAddr(*r.sym) + a; return r.sym->getGotVA(ctx) + a; case R_GOTONLY_PC: @@ -798,7 +798,7 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, case R_GOTPLTONLY_PC: return ctx.in.gotPlt->getVA() + a - p; case R_GOTREL: - case R_PPC64_RELAX_TOC: + case RE_PPC64_RELAX_TOC: return r.sym->getVA(ctx, a) - ctx.in.got->getVA(); case R_GOTPLTREL: return r.sym->getVA(ctx, a) - ctx.in.gotPlt->getVA(); @@ -809,10 +809,10 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, case R_GOT_OFF: case R_RELAX_TLS_GD_TO_IE_GOT_OFF: return r.sym->getGotOffset(ctx) + a; - case R_AARCH64_GOT_PAGE_PC: - case R_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC: + case RE_AARCH64_GOT_PAGE_PC: + case RE_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC: return getAArch64Page(r.sym->getGotVA(ctx) + a) - getAArch64Page(p); - case R_AARCH64_GOT_PAGE: + case RE_AARCH64_GOT_PAGE: return r.sym->getGotVA(ctx) + a - getAArch64Page(ctx.in.got->getVA()); case R_GOT_PC: case R_RELAX_TLS_GD_TO_IE: @@ -821,17 +821,17 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, return r.sym->getGotPltVA(ctx) + a - ctx.in.got->getVA(); case R_GOTPLT_PC: return r.sym->getGotPltVA(ctx) + a - p; - case R_LOONGARCH_GOT_PAGE_PC: + case RE_LOONGARCH_GOT_PAGE_PC: if (r.sym->hasFlag(NEEDS_TLSGD)) return getLoongArchPageDelta(ctx.in.got->getGlobalDynAddr(*r.sym) + a, p, r.type); return getLoongArchPageDelta(r.sym->getGotVA(ctx) + a, p, r.type); - case R_MIPS_GOTREL: + case RE_MIPS_GOTREL: return r.sym->getVA(ctx, a) - ctx.in.mipsGot->getGp(file); - case R_MIPS_GOT_GP: + case RE_MIPS_GOT_GP: return ctx.in.mipsGot->getGp(file) + a; - case R_MIPS_GOT_GP_PC: { - // R_MIPS_LO16 expression has R_MIPS_GOT_GP_PC r.type iif the target + case RE_MIPS_GOT_GP_PC: { + // R_MIPS_LO16 expression has RE_MIPS_GOT_GP_PC r.type iif the target // is _gp_disp symbol. In that case we should use the following // formula for calculation "AHL + GP - P + 4". For details see p. 4-19 at // ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf @@ -845,43 +845,43 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, v -= 1; return v; } - case R_MIPS_GOT_LOCAL_PAGE: + case RE_MIPS_GOT_LOCAL_PAGE: // If relocation against MIPS local symbol requires GOT entry, this entry // should be initialized by 'page address'. This address is high 16-bits // of sum the symbol's value and the addend. return ctx.in.mipsGot->getVA() + ctx.in.mipsGot->getPageEntryOffset(file, *r.sym, a) - ctx.in.mipsGot->getGp(file); - case R_MIPS_GOT_OFF: - case R_MIPS_GOT_OFF32: + case RE_MIPS_GOT_OFF: + case RE_MIPS_GOT_OFF32: // In case of MIPS if a GOT relocation has non-zero addend this addend // should be applied to the GOT entry content not to the GOT entry offset. // That is why we use separate expression r.type. return ctx.in.mipsGot->getVA() + ctx.in.mipsGot->getSymEntryOffset(file, *r.sym, a) - ctx.in.mipsGot->getGp(file); - case R_MIPS_TLSGD: + case RE_MIPS_TLSGD: return ctx.in.mipsGot->getVA() + ctx.in.mipsGot->getGlobalDynOffset(file, *r.sym) - ctx.in.mipsGot->getGp(file); - case R_MIPS_TLSLD: + case RE_MIPS_TLSLD: return ctx.in.mipsGot->getVA() + ctx.in.mipsGot->getTlsIndexOffset(file) - ctx.in.mipsGot->getGp(file); - case R_AARCH64_PAGE_PC: { + case RE_AARCH64_PAGE_PC: { uint64_t val = r.sym->isUndefWeak() ? 
p + a : r.sym->getVA(ctx, a); return getAArch64Page(val) - getAArch64Page(p); } - case R_RISCV_PC_INDIRECT: { + case RE_RISCV_PC_INDIRECT: { if (const Relocation *hiRel = getRISCVPCRelHi20(ctx, this, r)) return getRelocTargetVA(ctx, *hiRel, r.sym->getVA(ctx)); return 0; } - case R_LOONGARCH_PAGE_PC: + case RE_LOONGARCH_PAGE_PC: return getLoongArchPageDelta(r.sym->getVA(ctx, a), p, r.type); case R_PC: - case R_ARM_PCA: { + case RE_ARM_PCA: { uint64_t dest; - if (r.expr == R_ARM_PCA) + if (r.expr == RE_ARM_PCA) // Some PC relative ARM (Thumb) relocations align down the place. p = p & 0xfffffffc; if (r.sym->isUndefined()) { @@ -909,20 +909,20 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, case R_PLT: return r.sym->getPltVA(ctx) + a; case R_PLT_PC: - case R_PPC64_CALL_PLT: + case RE_PPC64_CALL_PLT: return r.sym->getPltVA(ctx) + a - p; - case R_LOONGARCH_PLT_PAGE_PC: + case RE_LOONGARCH_PLT_PAGE_PC: return getLoongArchPageDelta(r.sym->getPltVA(ctx) + a, p, r.type); case R_PLT_GOTPLT: return r.sym->getPltVA(ctx) + a - ctx.in.gotPlt->getVA(); case R_PLT_GOTREL: return r.sym->getPltVA(ctx) + a - ctx.in.got->getVA(); - case R_PPC32_PLTREL: + case RE_PPC32_PLTREL: // R_PPC_PLTREL24 uses the addend (usually 0 or 0x8000) to indicate r30 // stores _GLOBAL_OFFSET_TABLE_ or .got2+0x8000. The addend is ignored for // target VA computation. return r.sym->getPltVA(ctx) - p; - case R_PPC64_CALL: { + case RE_PPC64_CALL: { uint64_t symVA = r.sym->getVA(ctx, a); // If we have an undefined weak symbol, we might get here with a symbol // address of zero. That could overflow, but the code must be unreachable, @@ -939,10 +939,10 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, return symVA - p + getPPC64GlobalEntryToLocalEntryOffset(ctx, r.sym->stOther); } - case R_PPC64_TOCBASE: + case RE_PPC64_TOCBASE: return getPPC64TocBase(ctx) + a; case R_RELAX_GOT_PC: - case R_PPC64_RELAX_GOT_PC: + case RE_PPC64_RELAX_GOT_PC: return r.sym->getVA(ctx, a) - p; case R_RELAX_TLS_GD_TO_LE: case R_RELAX_TLS_IE_TO_LE: @@ -968,10 +968,10 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, return ctx.in.got->getTlsDescAddr(*r.sym) + a - p; case R_TLSDESC_GOTPLT: return ctx.in.got->getTlsDescAddr(*r.sym) + a - ctx.in.gotPlt->getVA(); - case R_AARCH64_TLSDESC_PAGE: + case RE_AARCH64_TLSDESC_PAGE: return getAArch64Page(ctx.in.got->getTlsDescAddr(*r.sym) + a) - getAArch64Page(p); - case R_LOONGARCH_TLSDESC_PAGE_PC: + case RE_LOONGARCH_TLSDESC_PAGE_PC: return getLoongArchPageDelta(ctx.in.got->getTlsDescAddr(*r.sym) + a, p, r.type); case R_TLSGD_GOT: @@ -980,7 +980,7 @@ uint64_t InputSectionBase::getRelocTargetVA(Ctx &ctx, const Relocation &r, return ctx.in.got->getGlobalDynAddr(*r.sym) + a - ctx.in.gotPlt->getVA(); case R_TLSGD_PC: return ctx.in.got->getGlobalDynAddr(*r.sym) + a - p; - case R_LOONGARCH_TLSGD_PAGE_PC: + case RE_LOONGARCH_TLSGD_PAGE_PC: return getLoongArchPageDelta(ctx.in.got->getGlobalDynAddr(*r.sym) + a, p, r.type); case R_TLSLD_GOTPLT: @@ -1114,7 +1114,7 @@ void InputSection::relocateNonAlloc(Ctx &ctx, uint8_t *buf, // R_ABS/R_DTPREL and some other relocations can be used from non-SHF_ALLOC // sections. 
if (LLVM_LIKELY(expr == R_ABS) || expr == R_DTPREL || expr == R_GOTPLTREL || - expr == R_RISCV_ADD || expr == R_ARM_SBREL) { + expr == RE_RISCV_ADD || expr == RE_ARM_SBREL) { target.relocateNoSym(bufLoc, type, SignExtend64(sym.getVA(ctx, addend))); continue; diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 605321b3cc9e3..4aa27b0a71bc1 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -205,32 +205,32 @@ static bool isAbsoluteValue(const Symbol &sym) { // Returns true if Expr refers a PLT entry. static bool needsPlt(RelExpr expr) { return oneof(expr); + R_GOTPLT_PC, RE_LOONGARCH_PLT_PAGE_PC, RE_PPC32_PLTREL, + RE_PPC64_CALL_PLT>(expr); } bool lld::elf::needsGot(RelExpr expr) { - return oneof( + return oneof( expr); } // True if this expression is of the form Sym - X, where X is a position in the // file (PC, or GOT for example). static bool isRelExpr(RelExpr expr) { - return oneof(expr); + return oneof(expr); } static RelExpr toPlt(RelExpr expr) { switch (expr) { - case R_LOONGARCH_PAGE_PC: - return R_LOONGARCH_PLT_PAGE_PC; - case R_PPC64_CALL: - return R_PPC64_CALL_PLT; + case RE_LOONGARCH_PAGE_PC: + return RE_LOONGARCH_PLT_PAGE_PC; + case RE_PPC64_CALL: + return RE_PPC64_CALL_PLT; case R_PC: return R_PLT_PC; case R_ABS: @@ -247,12 +247,12 @@ static RelExpr fromPlt(RelExpr expr) { // reference to the symbol itself. switch (expr) { case R_PLT_PC: - case R_PPC32_PLTREL: + case RE_PPC32_PLTREL: return R_PC; - case R_LOONGARCH_PLT_PAGE_PC: - return R_LOONGARCH_PAGE_PC; - case R_PPC64_CALL_PLT: - return R_PPC64_CALL; + case RE_LOONGARCH_PLT_PAGE_PC: + return RE_LOONGARCH_PAGE_PC; + case RE_PPC64_CALL_PLT: + return RE_PPC64_CALL; case R_PLT: return R_ABS; case R_PLT_GOTPLT: @@ -495,7 +495,7 @@ class RelocationScanner { template int64_t RelocationScanner::computeMipsAddend(const RelTy &rel, RelExpr expr, bool isLocal) const { - if (expr == R_MIPS_GOTREL && isLocal) + if (expr == RE_MIPS_GOTREL && isLocal) return sec->getFile()->mipsGp0; // The ABI says that the paired relocation is used only for REL. @@ -969,13 +969,14 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, const Symbol &sym, uint64_t relOff) const { // These expressions always compute a constant - if (oneof(e)) + if (oneof(e)) return true; // These never do, except if the entire file is position dependent or if @@ -984,13 +985,13 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, return ctx.target->usesOnlyLowPageBits(type) || !ctx.arg.isPic; // R_AARCH64_AUTH_ABS64 requires a dynamic relocation. - if (sym.isPreemptible || e == R_AARCH64_AUTH) + if (sym.isPreemptible || e == RE_AARCH64_AUTH) return false; if (!ctx.arg.isPic) return true; // Constant when referencing a non-preemptible symbol. - if (e == R_SIZE || e == R_RISCV_LEB128) + if (e == R_SIZE || e == RE_RISCV_LEB128) return true; // For the target and the relocation, we want to know if they are @@ -1047,7 +1048,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, if (expr != R_GOT_PC) { // The 0x8000 bit of r_addend of R_PPC_PLTREL24 is used to choose call // stub type. It should be ignored if optimized to R_PC. - if (ctx.arg.emachine == EM_PPC && expr == R_PPC32_PLTREL) + if (ctx.arg.emachine == EM_PPC && expr == RE_PPC32_PLTREL) addend &= ~0x8000; // R_HEX_GD_PLT_B22_PCREL (call a@GDPLT) is transformed into // call __tls_get_addr even if the symbol is non-preemptible. 
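The hunks above and below follow one mechanical pattern: RelExpr members whose semantics are specific to one target gain an RE_ prefix, generic members keep R_, and the variadic oneof checks are updated to match. A minimal sketch of that pattern, assuming a simplified enum and a stand-in oneof helper (the member list and helper body here are illustrative, not lld's actual definitions):

#include <cstdio>

// Illustrative subset: generic expressions keep the R_ prefix, while
// target-specific ones use RE_ plus the architecture name.
enum RelExpr {
  R_ABS,
  R_PC,
  R_PLT_PC,
  RE_PPC64_CALL_PLT,        // PPC64-specific PLT call expression
  RE_LOONGARCH_PLT_PAGE_PC, // LoongArch page-aligned PLT reference
};

// Membership test in the style of lld's oneof<> helper: true if expr
// equals any of the listed expressions (C++17 fold expression).
template <RelExpr... Exprs> static bool oneof(RelExpr expr) {
  return ((expr == Exprs) || ...);
}

// Mirrors how needsPlt mixes generic (R_) and target-specific (RE_) members.
static bool needsPlt(RelExpr expr) {
  return oneof<R_PLT_PC, RE_PPC64_CALL_PLT, RE_LOONGARCH_PLT_PAGE_PC>(expr);
}

int main() {
  std::printf("%d %d\n", needsPlt(RE_PPC64_CALL_PLT), needsPlt(R_ABS)); // 1 0
  return 0;
}

The payoff of the rename is readability: with two visually distinct prefixes, a target-neutral code path that starts depending on a target-specific expression stands out immediately in checks like the one above.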
@@ -1087,7 +1088,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, // ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf ctx.in.mipsGot->addEntry(*sec->file, sym, addend, expr); } else if (!sym.isTls() || ctx.arg.emachine != EM_LOONGARCH) { - // Many LoongArch TLS relocs reuse the R_LOONGARCH_GOT type, in which + // Many LoongArch TLS relocs reuse the RE_LOONGARCH_GOT type, in which // case the NEEDS_GOT flag shouldn't get set. sym.setFlags(NEEDS_GOT); } @@ -1128,7 +1129,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, (isa(sec) && ctx.arg.emachine != EM_MIPS)); if (canWrite) { RelType rel = ctx.target->getDynRel(type); - if (oneof(expr) || + if (oneof(expr) || (rel == ctx.target->symbolicRel && !sym.isPreemptible)) { addRelativeReloc(ctx, *sec, offset, sym, addend, expr, type); return; @@ -1267,12 +1268,12 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, static unsigned handleMipsTlsRelocation(Ctx &ctx, RelType type, Symbol &sym, InputSectionBase &c, uint64_t offset, int64_t addend, RelExpr expr) { - if (expr == R_MIPS_TLSLD) { + if (expr == RE_MIPS_TLSLD) { ctx.in.mipsGot->addTlsIndex(*c.file); c.addReloc({expr, type, offset, addend, &sym}); return 1; } - if (expr == R_MIPS_TLSGD) { + if (expr == RE_MIPS_TLSGD) { ctx.in.mipsGot->addDynTlsEntry(*c.file, sym); c.addReloc({expr, type, offset, addend, &sym}); return 1; @@ -1307,7 +1308,7 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type, // LoongArch does not yet implement transition from TLSDESC to LE/IE, so // generate TLSDESC dynamic relocation for the dynamic linker to handle. if (ctx.arg.emachine == EM_LOONGARCH && - oneof(expr)) { if (expr != R_TLSDESC_CALL) { sym.setFlags(NEEDS_TLSDESC); @@ -1318,7 +1319,7 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type, bool isRISCV = ctx.arg.emachine == EM_RISCV; - if (oneof(expr) && ctx.arg.shared) { // R_RISCV_TLSDESC_{LOAD_LO12,ADD_LO12_I,CALL} reference a label. Do not @@ -1387,9 +1388,9 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type, return 1; } - if (oneof(expr)) { + RE_LOONGARCH_TLSGD_PAGE_PC>(expr)) { if (!execOptimize) { sym.setFlags(NEEDS_TLSGD); sec->addReloc({expr, type, offset, addend, &sym}); @@ -1413,8 +1414,8 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type, return ctx.target->getTlsGdRelaxSkip(type); } - if (oneof(expr)) { + if (oneof(expr)) { ctx.hasTlsIe.store(true, std::memory_order_relaxed); // Initial-Exec relocs can be optimized to Local-Exec if the symbol is // locally defined. This is not supported on SystemZ. @@ -1524,8 +1525,8 @@ void RelocationScanner::scanOne(typename Relocs::const_iterator &i) { if (oneof(expr)) { ctx.in.gotPlt->hasGotPltOffRel.store(true, std::memory_order_relaxed); - } else if (oneof(expr)) { + } else if (oneof(expr)) { ctx.in.got->hasGotOffRel.store(true, std::memory_order_relaxed); } diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h index 041bd48048587..71cea0220e04c 100644 --- a/lld/ELF/Relocations.h +++ b/lld/ELF/Relocations.h @@ -89,42 +89,42 @@ enum RelExpr { // // Even though RelExpr is intended to be a target-neutral representation // of a relocation type, there are some relocations whose semantics are - // unique to a target. Such relocation are marked with R_. 
- R_AARCH64_GOT_PAGE_PC, - R_AARCH64_GOT_PAGE, - R_AARCH64_PAGE_PC, - R_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC, - R_AARCH64_TLSDESC_PAGE, - R_AARCH64_AUTH, - R_ARM_PCA, - R_ARM_SBREL, - R_MIPS_GOTREL, - R_MIPS_GOT_GP, - R_MIPS_GOT_GP_PC, - R_MIPS_GOT_LOCAL_PAGE, - R_MIPS_GOT_OFF, - R_MIPS_GOT_OFF32, - R_MIPS_TLSGD, - R_MIPS_TLSLD, - R_PPC32_PLTREL, - R_PPC64_CALL, - R_PPC64_CALL_PLT, - R_PPC64_RELAX_TOC, - R_PPC64_TOCBASE, - R_PPC64_RELAX_GOT_PC, - R_RISCV_ADD, - R_RISCV_LEB128, - R_RISCV_PC_INDIRECT, + // unique to a target. Such relocations are marked with RE_. + RE_AARCH64_GOT_PAGE_PC, + RE_AARCH64_GOT_PAGE, + RE_AARCH64_PAGE_PC, + RE_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC, + RE_AARCH64_TLSDESC_PAGE, + RE_AARCH64_AUTH, + RE_ARM_PCA, + RE_ARM_SBREL, + RE_MIPS_GOTREL, + RE_MIPS_GOT_GP, + RE_MIPS_GOT_GP_PC, + RE_MIPS_GOT_LOCAL_PAGE, + RE_MIPS_GOT_OFF, + RE_MIPS_GOT_OFF32, + RE_MIPS_TLSGD, + RE_MIPS_TLSLD, + RE_PPC32_PLTREL, + RE_PPC64_CALL, + RE_PPC64_CALL_PLT, + RE_PPC64_RELAX_TOC, + RE_PPC64_TOCBASE, + RE_PPC64_RELAX_GOT_PC, + RE_RISCV_ADD, + RE_RISCV_LEB128, + RE_RISCV_PC_INDIRECT, // Same as R_PC but with page-aligned semantics. - R_LOONGARCH_PAGE_PC, + RE_LOONGARCH_PAGE_PC, // Same as R_PLT_PC but with page-aligned semantics. - R_LOONGARCH_PLT_PAGE_PC, + RE_LOONGARCH_PLT_PAGE_PC, // In addition to having page-aligned semantics, LoongArch GOT relocs are // also reused for TLS, making the semantics differ from other architectures. - R_LOONGARCH_GOT, - R_LOONGARCH_GOT_PAGE_PC, - R_LOONGARCH_TLSGD_PAGE_PC, - R_LOONGARCH_TLSDESC_PAGE_PC, + RE_LOONGARCH_GOT, + RE_LOONGARCH_GOT_PAGE_PC, + RE_LOONGARCH_TLSGD_PAGE_PC, + RE_LOONGARCH_TLSDESC_PAGE_PC, }; // Architecture-neutral representation of relocation. diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 21fe2a25fa1bd..6c5f2a614639c 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -749,7 +749,7 @@ MipsGotSection::MipsGotSection(Ctx &ctx) void MipsGotSection::addEntry(InputFile &file, Symbol &sym, int64_t addend, RelExpr expr) { FileGot &g = getGot(file); - if (expr == R_MIPS_GOT_LOCAL_PAGE) { + if (expr == RE_MIPS_GOT_LOCAL_PAGE) { if (const OutputSection *os = sym.getOutputSection()) g.pagesMap.insert({os, {}}); else @@ -760,7 +760,7 @@ void MipsGotSection::addEntry(InputFile &file, Symbol &sym, int64_t addend, g.relocs.insert({&sym, 0}); else if (sym.isPreemptible) g.global.insert({&sym, 0}); - else if (expr == R_MIPS_GOT_OFF32) + else if (expr == RE_MIPS_GOT_OFF32) g.local32.insert({{&sym, addend}, 0}); else g.local16.insert({{&sym, addend}, 0}); diff --git a/lld/Maintainers.md b/lld/Maintainers.md new file mode 100644 index 0000000000000..d210246267656 --- /dev/null +++ b/lld/Maintainers.md @@ -0,0 +1,40 @@ +# LLD Maintainers + +This file is a list of the +[maintainers](https://llvm.org/docs/DeveloperPolicy.html#maintainers) for +LLD. + +## Current Maintainers + +The following people are the active maintainers for the project. Please reach +out to them for code reviews, questions about their area of expertise, or other +assistance.
+ +### COFF, ELF backends (COFF/* ELF/*) + +Rui Ueyama \ +ruiu@google.com (email) + +### Old Mach-O backend + +Lang Hames \ +lhames@gmail.com (email) + +Nick Kledzik \ +kledzik@apple.com (email) + +### WebAssembly backend (wasm/*) + +Sam Clegg \ +sbc@chromium.org (email) + +### New Mach-O backend + +Jez Ng \ +jezng@fb.com (email) + +Greg McGary \ +gkm@fb.com (email) + +Shoaib Meenai \ +smeenai@fb.com (email) diff --git a/lld/test/ELF/systemz-gotent-relax-und-dso.s b/lld/test/ELF/systemz-gotent-relax-und-dso.s index 5a1bd7f949f89..e8b88056299cb 100644 --- a/lld/test/ELF/systemz-gotent-relax-und-dso.s +++ b/lld/test/ELF/systemz-gotent-relax-und-dso.s @@ -14,9 +14,9 @@ # DISASM: Disassembly of section .text: # DISASM-EMPTY: # DISASM-NEXT: : -# DISASM-NEXT: nop 0 +# DISASM-NEXT: nop # DISASM: : -# DISASM-NEXT: nop 0 +# DISASM-NEXT: nop # DISASM: <_start>: # DISASM-NEXT: lgrl %r1, 0x2400 # DISASM-NEXT: lgrl %r1, 0x2400 diff --git a/lld/test/ELF/systemz-gotent-relax.s b/lld/test/ELF/systemz-gotent-relax.s index e84fd8d4653e9..88b43a4e9d29c 100644 --- a/lld/test/ELF/systemz-gotent-relax.s +++ b/lld/test/ELF/systemz-gotent-relax.s @@ -30,9 +30,9 @@ # DISASM: Disassembly of section .text: # DISASM: 00000000010011e0 : -# DISASM-NEXT: nop 0 +# DISASM-NEXT: nop # DISASM: 00000000010011e4 : -# DISASM-NEXT: nop 0 +# DISASM-NEXT: nop # DISASM: 00000000010011e8 : # DISASM-NEXT: br %r14 # DISASM: 00000000010011ea <_start>: diff --git a/lld/test/ELF/systemz-plt.s b/lld/test/ELF/systemz-plt.s index c7563cd18c274..717343ce4c4d5 100644 --- a/lld/test/ELF/systemz-plt.s +++ b/lld/test/ELF/systemz-plt.s @@ -48,9 +48,9 @@ # DIS-NEXT: 100102c: d2 07 f0 30 10 08 mvc 48(8,%r15), 8(%r1) # DIS-NEXT: 1001032: e3 10 10 10 00 04 lg %r1, 16(%r1) # DIS-NEXT: 1001038: 07 f1 br %r1 -# DIS-NEXT: 100103a: 07 00 nopr %r0 -# DIS-NEXT: 100103c: 07 00 nopr %r0 -# DIS-NEXT: 100103e: 07 00 nopr %r0 +# DIS-NEXT: 100103a: 07 00 nopr +# DIS-NEXT: 100103c: 07 00 nopr +# DIS-NEXT: 100103e: 07 00 nopr # DIS-NEXT: 1001040: c0 10 00 00 10 54 larl %r1, 0x10030e8 # DIS-NEXT: 1001046: e3 10 10 00 00 04 lg %r1, 0(%r1) # DIS-NEXT: 100104c: 07 f1 br %r1 diff --git a/lld/test/ELF/systemz-tls-gd.s b/lld/test/ELF/systemz-tls-gd.s index 3976f55a6ae39..742797e2d62e4 100644 --- a/lld/test/ELF/systemz-tls-gd.s +++ b/lld/test/ELF/systemz-tls-gd.s @@ -58,17 +58,17 @@ ## TP offset of a is at 0x1002218 # LE-NEXT: lgrl %r2, 0x1002218 -# LE-NEXT: brcl 0, +# LE-NEXT: jgnop # LE-NEXT: lgf %r2, 0(%r2,%r7) ## TP offset of b is at 0x1002220 # LE-NEXT: lgrl %r2, 0x1002220 -# LE-NEXT: brcl 0, +# LE-NEXT: jgnop # LE-NEXT: lgf %r2, 0(%r2,%r7) ## TP offset of c is at 0x1002228 # LE-NEXT: lgrl %r2, 0x1002228 -# LE-NEXT: brcl 0, +# LE-NEXT: jgnop # LE-NEXT: lgf %r2, 0(%r2,%r7) ## TP offsets @@ -88,7 +88,7 @@ ## TP offset of a is at 0x1002340 # IE-NEXT: lgrl %r2, 0x1002340 -# IE-NEXT: brcl 0, +# IE-NEXT: jgnop # IE-NEXT: lgf %r2, 0(%r2,%r7) ## GOT offset of the TP offset for b is at 0x1002348 diff --git a/lld/test/ELF/systemz-tls-ld.s b/lld/test/ELF/systemz-tls-ld.s index 2cb36d7294f2b..ef104b82644ce 100644 --- a/lld/test/ELF/systemz-tls-ld.s +++ b/lld/test/ELF/systemz-tls-ld.s @@ -49,7 +49,7 @@ ## GOT offset of the LDM TLS module ID is at 0x1002210 # LE-NEXT: lgrl %r2, 0x1002210 -# LE-NEXT: brcl 0, +# LE-NEXT: jgnop # LE-NEXT: la %r2, 0(%r2,%r7) ## TP offset for a is at 0x1002218 diff --git a/lld/test/wasm/compress-relocs.ll b/lld/test/wasm/compress-relocs.ll index f1faab754cb76..cea9f3476e996 100644 --- a/lld/test/wasm/compress-relocs.ll +++ 
b/lld/test/wasm/compress-relocs.ll @@ -1,5 +1,5 @@ ; RUN: llc -filetype=obj %s -o %t.o -; RUN: llvm-mc -mattr=+reference-types -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/call-indirect.s -o %t2.o +; RUN: llvm-mc -mattr=+call-indirect-overlong -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/call-indirect.s -o %t2.o ; RUN: wasm-ld --export-dynamic -o %t.wasm %t2.o %t.o ; RUN: obj2yaml %t.wasm | FileCheck %s ; RUN: wasm-ld --export-dynamic -O2 -o %t-opt.wasm %t2.o %t.o diff --git a/lld/test/wasm/import-table-explicit.s b/lld/test/wasm/import-table-explicit.s index 1dc21beba0629..701b7a1dc3e16 100644 --- a/lld/test/wasm/import-table-explicit.s +++ b/lld/test/wasm/import-table-explicit.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc -mattr=+reference-types -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o +# RUN: llvm-mc -mattr=+call-indirect-overlong -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o # RUN: wasm-ld --import-table -o %t.wasm %t.o # RUN: obj2yaml %t.wasm | FileCheck %s diff --git a/lld/test/wasm/invalid-mvp-table-use.s b/lld/test/wasm/invalid-mvp-table-use.s index b4f12a7eeb9a4..58c472e29d1ad 100644 --- a/lld/test/wasm/invalid-mvp-table-use.s +++ b/lld/test/wasm/invalid-mvp-table-use.s @@ -1,7 +1,7 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s # # If any table is defined or declared besides the __indirect_function_table, -# the compilation unit should be compiled with -mattr=+reference-types, +# the compilation unit should be compiled with -mattr=+call-indirect-overlong, # causing symbol table entries to be emitted for all tables. # RUN: not wasm-ld --no-entry %t.o -o %t.wasm 2>&1 | FileCheck -check-prefix=CHECK-ERR %s diff --git a/lld/test/wasm/lto/Inputs/libcall-archive.ll b/lld/test/wasm/lto/Inputs/libcall-archive.ll index 7d8c34196dfe4..30764af83e673 100644 --- a/lld/test/wasm/lto/Inputs/libcall-archive.ll +++ b/lld/test/wasm/lto/Inputs/libcall-archive.ll @@ -5,4 +5,4 @@ define void @memcpy() #0 { ret void } -attributes #0 = { "target-features"="-bulk-memory" } +attributes #0 = { "target-features"="-bulk-memory,-bulk-memory-opt" } diff --git a/lld/test/wasm/lto/libcall-archive.ll b/lld/test/wasm/lto/libcall-archive.ll index 5c46d2f7ed783..0cee9a5de29f6 100644 --- a/lld/test/wasm/lto/libcall-archive.ll +++ b/lld/test/wasm/lto/libcall-archive.ll @@ -16,7 +16,7 @@ entry: declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) -attributes #0 = { "target-features"="-bulk-memory" } +attributes #0 = { "target-features"="-bulk-memory,-bulk-memory-opt" } ; CHECK: - Type: CUSTOM ; CHECK-NEXT: Name: name diff --git a/lld/test/wasm/lto/stub-library-libcall.s b/lld/test/wasm/lto/stub-library-libcall.s index d65983c0cf5bf..40e15933f7bc3 100644 --- a/lld/test/wasm/lto/stub-library-libcall.s +++ b/lld/test/wasm/lto/stub-library-libcall.s @@ -2,7 +2,7 @@ # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t_main.o %t/main.s # RUN: llvm-as %S/Inputs/foo.ll -o %t_foo.o # RUN: llvm-as %S/Inputs/libcall.ll -o %t_libcall.o -# RUN: wasm-ld -mllvm -mattr=-bulk-memory %t_main.o %t_libcall.o %t_foo.o %p/Inputs/stub.so -o %t.wasm +# RUN: wasm-ld -mllvm -mattr=-bulk-memory,-bulk-memory-opt %t_main.o %t_libcall.o %t_foo.o %p/Inputs/stub.so -o %t.wasm # RUN: obj2yaml %t.wasm | FileCheck %s # The function `func_with_libcall` will generate an undefined reference to @@ -12,7 +12,7 @@ # If %t_foo.o is not included in the link we get an undefined symbol reported # to the dependency of memcpy on the foo export: -# RUN: not wasm-ld -mllvm 
-mattr=-bulk-memory %t_main.o %t_libcall.o %p/Inputs/stub.so -o %t.wasm 2>&1 | FileCheck --check-prefix=MISSING %s +# RUN: not wasm-ld -mllvm -mattr=-bulk-memory,-bulk-memory-opt %t_main.o %t_libcall.o %p/Inputs/stub.so -o %t.wasm 2>&1 | FileCheck --check-prefix=MISSING %s # MISSING: stub.so: undefined symbol: foo. Required by memcpy #--- main.s diff --git a/lld/test/wasm/multi-table.s b/lld/test/wasm/multi-table.s index bf905ac748f9f..afe8ddac49768 100644 --- a/lld/test/wasm/multi-table.s +++ b/lld/test/wasm/multi-table.s @@ -26,7 +26,7 @@ call_indirect_explicit_tables: call_indirect table_b, () -> () end_function -# RT-MVP: wasm-ld: error: object file not built with 'reference-types' feature conflicts with import of table table_a by file +# RT-MVP: wasm-ld: error: object file not built with 'reference-types' or 'call-indirect-overlong' feature conflicts with import of table table_a by file # CHECK: --- !WASM # CHECK-NEXT: FileHeader: diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 8d01ff839ddfc..37a0156c728f6 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -59,6 +59,7 @@ void Ctx::reset() { stubFiles.clear(); sharedFiles.clear(); bitcodeFiles.clear(); + lazyBitcodeFiles.clear(); syntheticFunctions.clear(); syntheticGlobals.clear(); syntheticTables.clear(); diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp index fd06788457966..221f02aa1c157 100644 --- a/lld/wasm/InputFiles.cpp +++ b/lld/wasm/InputFiles.cpp @@ -255,13 +255,14 @@ static void setRelocs(const std::vector &chunks, } } -// An object file can have two approaches to tables. With the reference-types -// feature enabled, input files that define or use tables declare the tables -// using symbols, and record each use with a relocation. This way when the -// linker combines inputs, it can collate the tables used by the inputs, -// assigning them distinct table numbers, and renumber all the uses as -// appropriate. At the same time, the linker has special logic to build the -// indirect function table if it is needed. +// An object file can have two approaches to tables. With the +// reference-types feature or call-indirect-overlong feature enabled +// (explicitly, or implied by the reference-types feature), input files that +// define or use tables declare the tables using symbols, and record each use +// with a relocation. This way when the linker combines inputs, it can collate +// the tables used by the inputs, assigning them distinct table numbers, and +// renumber all the uses as appropriate. At the same time, the linker has +// special logic to build the indirect function table if it is needed. // // However, MVP object files (those that target WebAssembly 1.0, the "minimum // viable product" version of WebAssembly) neither write table symbols nor @@ -284,9 +285,9 @@ void ObjFile::addLegacyIndirectFunctionTableIfNeeded( return; // It's possible for an input to define tables and also use the indirect - // function table, but forget to compile with -mattr=+reference-types. - // For these newer files, we require symbols for all tables, and - // relocations for all of their uses. + // function table, but forget to compile with -mattr=+call-indirect-overlong + // or -mattr=+reference-types. For these newer files, we require symbols for + // all tables, and relocations for all of their uses. 
if (tableSymbolCount != 0) { error(toString(this) + ": expected one symbol table entry for each of the " + diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp index 1454c3324af98..6b32d12ebeb45 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -326,8 +326,9 @@ void TableSection::addTable(InputTable *table) { // to assign table number 0 to the indirect function table. for (const auto *culprit : out.importSec->importedSymbols) { if (isa(culprit)) { - error("object file not built with 'reference-types' feature " - "conflicts with import of table " + + error("object file not built with 'reference-types' or " + "'call-indirect-overlong' feature conflicts with import of " + "table " + culprit->getName() + " by file " + toString(culprit->getFile())); return; diff --git a/lldb/bindings/interface/SBMemoryRegionInfoListExtensions.i b/lldb/bindings/interface/SBMemoryRegionInfoListExtensions.i index 49d49110de7ff..29c0179c0ffe3 100644 --- a/lldb/bindings/interface/SBMemoryRegionInfoListExtensions.i +++ b/lldb/bindings/interface/SBMemoryRegionInfoListExtensions.i @@ -7,7 +7,12 @@ def __iter__(self): '''Iterate over all the memory regions in a lldb.SBMemoryRegionInfoList object.''' - return lldb_iter(self, 'GetSize', 'GetMemoryRegionAtIndex') + import lldb + size = self.GetSize() + region = lldb.SBMemoryRegionInfo() + for i in range(size): + self.GetMemoryRegionAtIndex(i, region) + yield region %} #endif } diff --git a/lldb/include/lldb/API/SBAddressRangeList.h b/lldb/include/lldb/API/SBAddressRangeList.h index 5a4eeecf37dc9..41085b1edf8d7 100644 --- a/lldb/include/lldb/API/SBAddressRangeList.h +++ b/lldb/include/lldb/API/SBAddressRangeList.h @@ -45,6 +45,7 @@ class LLDB_API SBAddressRangeList { private: friend class SBBlock; friend class SBProcess; + friend class SBFunction; lldb_private::AddressRangeListImpl &ref() const; diff --git a/lldb/include/lldb/API/SBFunction.h b/lldb/include/lldb/API/SBFunction.h index df607fdc7ebf5..0a8aeeff1ea5a 100644 --- a/lldb/include/lldb/API/SBFunction.h +++ b/lldb/include/lldb/API/SBFunction.h @@ -43,6 +43,8 @@ class LLDB_API SBFunction { lldb::SBAddress GetStartAddress(); + LLDB_DEPRECATED_FIXME("Not compatible with discontinuous functions.", + "GetRanges()") lldb::SBAddress GetEndAddress(); lldb::SBAddressRangeList GetRanges(); diff --git a/lldb/include/lldb/Core/AddressRangeListImpl.h b/lldb/include/lldb/Core/AddressRangeListImpl.h index 6742e6ead87de..6b88f9b1ac179 100644 --- a/lldb/include/lldb/Core/AddressRangeListImpl.h +++ b/lldb/include/lldb/Core/AddressRangeListImpl.h @@ -24,9 +24,8 @@ class AddressRangeListImpl { public: AddressRangeListImpl(); - AddressRangeListImpl(const AddressRangeListImpl &rhs) = default; - - AddressRangeListImpl &operator=(const AddressRangeListImpl &rhs); + explicit AddressRangeListImpl(AddressRanges ranges) + : m_ranges(std::move(ranges)) {} size_t GetSize() const; diff --git a/lldb/include/lldb/DataFormatters/DumpValueObjectOptions.h b/lldb/include/lldb/DataFormatters/DumpValueObjectOptions.h index c7f8cccc116c4..ce15963ab5662 100644 --- a/lldb/include/lldb/DataFormatters/DumpValueObjectOptions.h +++ b/lldb/include/lldb/DataFormatters/DumpValueObjectOptions.h @@ -22,13 +22,12 @@ namespace lldb_private { class DumpValueObjectOptions { public: struct PointerDepth { - enum class Mode { Always, Default, Never } m_mode; - uint32_t m_count; + uint32_t m_count = 0; PointerDepth Decremented() const { if (m_count > 0) - return PointerDepth{m_mode, m_count - 1}; - return 
PointerDepth{m_mode, m_count}; + return {m_count - 1}; + return *this; } bool CanAllowExpansion() const; @@ -65,8 +64,7 @@ class DumpValueObjectOptions { DumpValueObjectOptions(ValueObject &valobj); - DumpValueObjectOptions & - SetMaximumPointerDepth(PointerDepth depth = {PointerDepth::Mode::Never, 0}); + DumpValueObjectOptions &SetMaximumPointerDepth(uint32_t depth); DumpValueObjectOptions &SetMaximumDepth(uint32_t depth, bool is_default); diff --git a/lldb/include/lldb/Host/Socket.h b/lldb/include/lldb/Host/Socket.h index e98797b36c8a5..4585eac12efb9 100644 --- a/lldb/include/lldb/Host/Socket.h +++ b/lldb/include/lldb/Host/Socket.h @@ -11,6 +11,7 @@ #include #include +#include #include "lldb/Host/MainLoopBase.h" #include "lldb/Utility/Timeout.h" @@ -151,6 +152,11 @@ class Socket : public IOObject { // If this Socket is connected then return the URI used to connect. virtual std::string GetRemoteConnectionURI() const { return ""; }; + // If the Socket is listening then return the URI for clients to connect. + virtual std::vector GetListeningConnectionURI() const { + return {}; + } + protected: Socket(SocketProtocol protocol, bool should_close); diff --git a/lldb/include/lldb/Host/common/TCPSocket.h b/lldb/include/lldb/Host/common/TCPSocket.h index ca36622691fe9..cb950c0015ea6 100644 --- a/lldb/include/lldb/Host/common/TCPSocket.h +++ b/lldb/include/lldb/Host/common/TCPSocket.h @@ -13,6 +13,8 @@ #include "lldb/Host/Socket.h" #include "lldb/Host/SocketAddress.h" #include +#include +#include namespace lldb_private { class TCPSocket : public Socket { @@ -52,6 +54,8 @@ class TCPSocket : public Socket { std::string GetRemoteConnectionURI() const override; + std::vector GetListeningConnectionURI() const override; + private: TCPSocket(NativeSocket socket, const TCPSocket &listen_socket); diff --git a/lldb/include/lldb/Host/posix/DomainSocket.h b/lldb/include/lldb/Host/posix/DomainSocket.h index d4e0d43ee169c..3dbe6206da2c5 100644 --- a/lldb/include/lldb/Host/posix/DomainSocket.h +++ b/lldb/include/lldb/Host/posix/DomainSocket.h @@ -10,6 +10,8 @@ #define LLDB_HOST_POSIX_DOMAINSOCKET_H #include "lldb/Host/Socket.h" +#include +#include namespace lldb_private { class DomainSocket : public Socket { @@ -27,6 +29,8 @@ class DomainSocket : public Socket { std::string GetRemoteConnectionURI() const override; + std::vector GetListeningConnectionURI() const override; + protected: DomainSocket(SocketProtocol protocol); diff --git a/lldb/include/lldb/Symbol/Function.h b/lldb/include/lldb/Symbol/Function.h index 70f51a846f8d9..e4118c1f9be86 100644 --- a/lldb/include/lldb/Symbol/Function.h +++ b/lldb/include/lldb/Symbol/Function.h @@ -444,8 +444,11 @@ class Function : public UserID, public SymbolContextScope { Function *CalculateSymbolContextFunction() override; + /// DEPRECATED: Use GetAddressRanges instead. const AddressRange &GetAddressRange() { return m_range; } + AddressRanges GetAddressRanges() { return m_block.GetRanges(); } + lldb::LanguageType GetLanguage() const; /// Find the file and line number of the source location of the start of the /// function. This will use the declaration if present and fall back on the @@ -650,9 +653,6 @@ class Function : public UserID, public SymbolContextScope { /// All lexical blocks contained in this function. Block m_block; - /// List of address ranges belonging to the function. - AddressRanges m_ranges; - /// The function address range that covers the widest range needed to contain /// all blocks. 
DEPRECATED: do not use this field in new code as the range may /// include addresses belonging to other functions. diff --git a/lldb/include/lldb/Target/Platform.h b/lldb/include/lldb/Target/Platform.h index 920f80bc73317..f8a2cbf0d5d04 100644 --- a/lldb/include/lldb/Target/Platform.h +++ b/lldb/include/lldb/Target/Platform.h @@ -473,8 +473,6 @@ class Platform : public PluginInterface { LLVM_PRETTY_FUNCTION, GetName())); } - const std::string &GetRemoteURL() const { return m_remote_url; } - bool IsHost() const { return m_is_host; // Is this the default host platform? } @@ -977,7 +975,6 @@ class Platform : public PluginInterface { std::string m_sdk_build; FileSpec m_working_dir; // The working directory which is used when installing // modules that have no install path set - std::string m_remote_url; std::string m_hostname; llvm::VersionTuple m_os_version; ArchSpec diff --git a/lldb/source/API/SBFunction.cpp b/lldb/source/API/SBFunction.cpp index ac61220ec8736..3f6b4eea98318 100644 --- a/lldb/source/API/SBFunction.cpp +++ b/lldb/source/API/SBFunction.cpp @@ -10,6 +10,7 @@ #include "lldb/API/SBAddressRange.h" #include "lldb/API/SBProcess.h" #include "lldb/API/SBStream.h" +#include "lldb/Core/AddressRangeListImpl.h" #include "lldb/Core/Disassembler.h" #include "lldb/Core/Module.h" #include "lldb/Symbol/CompileUnit.h" @@ -153,10 +154,11 @@ SBAddress SBFunction::GetEndAddress() { SBAddress addr; if (m_opaque_ptr) { - addr_t byte_size = m_opaque_ptr->GetAddressRange().GetByteSize(); - if (byte_size > 0) { - addr.SetAddress(m_opaque_ptr->GetAddressRange().GetBaseAddress()); - addr->Slide(byte_size); + AddressRanges ranges = m_opaque_ptr->GetAddressRanges(); + if (!ranges.empty()) { + // Return the end of the first range, use GetRanges to get all ranges. + addr.SetAddress(ranges.front().GetBaseAddress()); + addr->Slide(ranges.front().GetByteSize()); } } return addr; @@ -166,11 +168,8 @@ lldb::SBAddressRangeList SBFunction::GetRanges() { LLDB_INSTRUMENT_VA(this); lldb::SBAddressRangeList ranges; - if (m_opaque_ptr) { - lldb::SBAddressRange range; - (*range.m_opaque_up) = m_opaque_ptr->GetAddressRange(); - ranges.Append(std::move(range)); - } + if (m_opaque_ptr) + ranges.ref() = AddressRangeListImpl(m_opaque_ptr->GetAddressRanges()); return ranges; } diff --git a/lldb/source/Core/AddressRangeListImpl.cpp b/lldb/source/Core/AddressRangeListImpl.cpp index d405cf0fa3ec3..257824a0551e1 100644 --- a/lldb/source/Core/AddressRangeListImpl.cpp +++ b/lldb/source/Core/AddressRangeListImpl.cpp @@ -13,14 +13,6 @@ using namespace lldb_private; AddressRangeListImpl::AddressRangeListImpl() : m_ranges() {} -AddressRangeListImpl & -AddressRangeListImpl::operator=(const AddressRangeListImpl &rhs) { - if (this == &rhs) - return *this; - m_ranges = rhs.m_ranges; - return *this; -} - size_t AddressRangeListImpl::GetSize() const { return m_ranges.size(); } void AddressRangeListImpl::Reserve(size_t capacity) { diff --git a/lldb/source/DataFormatters/DumpValueObjectOptions.cpp b/lldb/source/DataFormatters/DumpValueObjectOptions.cpp index 18d590d47d9a0..b952fb643f13e 100644 --- a/lldb/source/DataFormatters/DumpValueObjectOptions.cpp +++ b/lldb/source/DataFormatters/DumpValueObjectOptions.cpp @@ -14,10 +14,8 @@ using namespace lldb; using namespace lldb_private; DumpValueObjectOptions::DumpValueObjectOptions() - : m_summary_sp(), m_root_valobj_name(), - m_max_ptr_depth(PointerDepth{PointerDepth::Mode::Default, 0}), - m_decl_printing_helper(), m_child_printing_decider(), - m_pointer_as_array(), m_use_synthetic(true), + : 
m_summary_sp(), m_root_valobj_name(), m_decl_printing_helper(), + m_child_printing_decider(), m_pointer_as_array(), m_use_synthetic(true), m_scope_already_checked(false), m_flat_output(false), m_ignore_cap(false), m_show_types(false), m_show_location(false), m_use_objc(false), m_hide_root_type(false), m_hide_root_name(false), m_hide_name(false), @@ -33,8 +31,8 @@ DumpValueObjectOptions::DumpValueObjectOptions(ValueObject &valobj) } DumpValueObjectOptions & -DumpValueObjectOptions::SetMaximumPointerDepth(PointerDepth depth) { - m_max_ptr_depth = depth; +DumpValueObjectOptions::SetMaximumPointerDepth(uint32_t depth) { + m_max_ptr_depth = {depth}; return *this; } diff --git a/lldb/source/DataFormatters/ValueObjectPrinter.cpp b/lldb/source/DataFormatters/ValueObjectPrinter.cpp index face38253efab..01e604e019f25 100644 --- a/lldb/source/DataFormatters/ValueObjectPrinter.cpp +++ b/lldb/source/DataFormatters/ValueObjectPrinter.cpp @@ -503,14 +503,7 @@ ValueObjectPrinter::PrintObjectDescriptionIfNeeded(bool value_printed, } bool DumpValueObjectOptions::PointerDepth::CanAllowExpansion() const { - switch (m_mode) { - case Mode::Always: - case Mode::Default: - return m_count > 0; - case Mode::Never: - return false; - } - return false; + return m_count > 0; } bool ValueObjectPrinter::ShouldPrintChildren( diff --git a/lldb/source/Host/common/TCPSocket.cpp b/lldb/source/Host/common/TCPSocket.cpp index 5d863954ee886..d0055c3b6c44f 100644 --- a/lldb/source/Host/common/TCPSocket.cpp +++ b/lldb/source/Host/common/TCPSocket.cpp @@ -115,6 +115,14 @@ std::string TCPSocket::GetRemoteConnectionURI() const { return ""; } +std::vector TCPSocket::GetListeningConnectionURI() const { + std::vector URIs; + for (const auto &[fd, addr] : m_listen_sockets) + URIs.emplace_back(llvm::formatv("connection://[{0}]:{1}", + addr.GetIPAddress(), addr.GetPort())); + return URIs; +} + Status TCPSocket::CreateSocket(int domain) { Status error; if (IsValid()) diff --git a/lldb/source/Host/posix/DomainSocket.cpp b/lldb/source/Host/posix/DomainSocket.cpp index 0451834630d33..9a0b385d998bf 100644 --- a/lldb/source/Host/posix/DomainSocket.cpp +++ b/lldb/source/Host/posix/DomainSocket.cpp @@ -175,3 +175,17 @@ std::string DomainSocket::GetRemoteConnectionURI() const { "{0}://{1}", GetNameOffset() == 0 ? 
"unix-connect" : "unix-abstract-connect", name); } + +std::vector DomainSocket::GetListeningConnectionURI() const { + if (m_socket == kInvalidSocketValue) + return {}; + + struct sockaddr_un addr; + bzero(&addr, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + socklen_t addr_len = sizeof(struct sockaddr_un); + if (::getsockname(m_socket, (struct sockaddr *)&addr, &addr_len) != 0) + return {}; + + return {llvm::formatv("unix-connect://{0}", addr.sun_path)}; +} diff --git a/lldb/source/Interpreter/OptionGroupValueObjectDisplay.cpp b/lldb/source/Interpreter/OptionGroupValueObjectDisplay.cpp index 0e8c1f4b5f1d9..d633c469e603e 100644 --- a/lldb/source/Interpreter/OptionGroupValueObjectDisplay.cpp +++ b/lldb/source/Interpreter/OptionGroupValueObjectDisplay.cpp @@ -190,8 +190,7 @@ DumpValueObjectOptions OptionGroupValueObjectDisplay::GetAsDumpOptions( LanguageRuntimeDescriptionDisplayVerbosity lang_descr_verbosity, lldb::Format format, lldb::TypeSummaryImplSP summary_sp) { DumpValueObjectOptions options; - options.SetMaximumPointerDepth( - {DumpValueObjectOptions::PointerDepth::Mode::Always, ptr_depth}); + options.SetMaximumPointerDepth(ptr_depth); if (use_objc) options.SetShowSummary(false); else diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 4aa85a99edf01..daffa1379fe57 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -3775,6 +3775,7 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { SymbolType type = eSymbolTypeInvalid; SectionSP symbol_section; + lldb::addr_t symbol_byte_size = 0; bool add_nlist = true; bool is_gsym = false; bool demangled_is_synthesized = false; @@ -4360,6 +4361,47 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { if (symbol_section) { const addr_t section_file_addr = symbol_section->GetFileAddress(); + if (symbol_byte_size == 0 && function_starts_count > 0) { + addr_t symbol_lookup_file_addr = nlist.n_value; + // Do an exact address match for non-ARM addresses, else get the + // closest since the symbol might be a thumb symbol which has an + // address with bit zero set. + FunctionStarts::Entry *func_start_entry = + function_starts.FindEntry(symbol_lookup_file_addr, !is_arm); + if (is_arm && func_start_entry) { + // Verify that the function start address is the symbol address + // (ARM) or the symbol address + 1 (thumb). + if (func_start_entry->addr != symbol_lookup_file_addr && + func_start_entry->addr != (symbol_lookup_file_addr + 1)) { + // Not the right entry, NULL it out... 
+ func_start_entry = nullptr; + } + } + if (func_start_entry) { + func_start_entry->data = true; + + addr_t symbol_file_addr = func_start_entry->addr; + if (is_arm) + symbol_file_addr &= THUMB_ADDRESS_BIT_MASK; + + const FunctionStarts::Entry *next_func_start_entry = + function_starts.FindNextEntry(func_start_entry); + const addr_t section_end_file_addr = + section_file_addr + symbol_section->GetByteSize(); + if (next_func_start_entry) { + addr_t next_symbol_file_addr = next_func_start_entry->addr; + // Be sure to clear the Thumb address bit when we calculate the + // size from the current and next address + if (is_arm) + next_symbol_file_addr &= THUMB_ADDRESS_BIT_MASK; + symbol_byte_size = std::min( + next_symbol_file_addr - symbol_file_addr, + section_end_file_addr - symbol_file_addr); + } else { + symbol_byte_size = section_end_file_addr - symbol_file_addr; + } + } + } symbol_value -= section_file_addr; } @@ -4466,6 +4508,9 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { if (nlist.n_desc & N_WEAK_REF) sym[sym_idx].SetIsWeak(true); + if (symbol_byte_size > 0) + sym[sym_idx].SetByteSize(symbol_byte_size); + if (demangled_is_synthesized) sym[sym_idx].SetDemangledNameIsSynthesized(true); @@ -4584,7 +4629,23 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { Address symbol_addr; if (module_sp->ResolveFileAddress(symbol_file_addr, symbol_addr)) { SectionSP symbol_section(symbol_addr.GetSection()); + uint32_t symbol_byte_size = 0; if (symbol_section) { + const addr_t section_file_addr = symbol_section->GetFileAddress(); + const FunctionStarts::Entry *next_func_start_entry = + function_starts.FindNextEntry(func_start_entry); + const addr_t section_end_file_addr = + section_file_addr + symbol_section->GetByteSize(); + if (next_func_start_entry) { + addr_t next_symbol_file_addr = next_func_start_entry->addr; + if (is_arm) + next_symbol_file_addr &= THUMB_ADDRESS_BIT_MASK; + symbol_byte_size = std::min( + next_symbol_file_addr - symbol_file_addr, + section_end_file_addr - symbol_file_addr); + } else { + symbol_byte_size = section_end_file_addr - symbol_file_addr; + } sym[sym_idx].SetID(synthetic_sym_id++); // Don't set the name for any synthetic symbols, the Symbol // object will generate one if needed when the name is accessed @@ -4596,6 +4657,8 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { add_symbol_addr(symbol_addr.GetFileAddress()); if (symbol_flags) sym[sym_idx].SetFlags(symbol_flags); + if (symbol_byte_size) + sym[sym_idx].SetByteSize(symbol_byte_size); ++sym_idx; } } diff --git a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp index df3bf157278da..bc886259d6fa5 100644 --- a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp +++ b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp @@ -299,9 +299,7 @@ size_t SymbolFileBreakpad::ParseBlocksRecursive(Function &func) { // "INLINE 0 ...", the current level is 0 and its parent block is the // function block at index 0.
std::vector<Block *> blocks; - Block &block = func.GetBlock(false); - block.AddRange(Block::Range(0, func.GetAddressRange().GetByteSize())); - blocks.push_back(&block); + blocks.push_back(&func.GetBlock(false)); size_t blocks_added = 0; addr_t func_base = func.GetAddressRange().GetBaseAddress().GetOffset(); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp index 6f2cb455ec00e..c71c2dd47344a 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp @@ -527,7 +527,7 @@ void DebugNamesDWARFIndex::GetTypesWithQuery( ConstString name = query.GetTypeBasename(); std::vector query_context = query.GetContextRef(); - if (query_context.size() <= 1) + if (query_context.size() <= 1 && !query.GetExactMatch()) return GetTypes(name, callback); llvm::SmallVector parent_contexts = diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp index 1220e6115a2a9..0be19ab29ef08 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp @@ -706,6 +706,11 @@ bool ManualDWARFIndex::Encode(DataEncoder &encoder) const { return true; } +bool ManualDWARFIndex::IsPartial() const { + // If we have units or type units to skip, then this index is partial. + return !m_units_to_avoid.empty() || !m_type_sigs_to_avoid.empty(); +} + std::string ManualDWARFIndex::GetCacheKey() { std::string key; llvm::raw_string_ostream strm(key); @@ -713,9 +718,26 @@ std::string ManualDWARFIndex::GetCacheKey() { // module can have one object file as the main executable and might have // another object file in a separate symbol file, or we might have a .dwo file // that claims its module is the main executable. + + // This class can be used to index all of the DWARF, or only part of it + // when there is a .debug_names index where some compile or type units were + // built without .debug_names. So we need to know whether we have a full + // manual DWARF index or a partial one, and save them to different cache + // files. Before this fix we might end up debugging a binary with + // .debug_names where some of the compile or type units weren't indexed, + // find an issue with the .debug_names tables (bugs or incomplete data), and + // then disable loading of the .debug_names by running "settings set + // plugin.symbol-file.dwarf.ignore-file-indexes 0" in another LLDB instance. + // The problem arose when an index cache from a previous run with + // .debug_names enabled had saved a cache file that covered only the compile + // and type units missing from the .debug_names; with file indexes disabled + // we would then load that partial index cache as if it were a full one. So + // we need to pick a unique cache suffix that indicates whether the cache is + // partial or full to avoid this problem. + llvm::StringRef dwarf_index_suffix(IsPartial() ?
"partial-" : "full-"); ObjectFile *objfile = m_dwarf->GetObjectFile(); strm << objfile->GetModule()->GetCacheKey() << "-dwarf-index-" - << llvm::format_hex(objfile->GetCacheHash(), 10); + << dwarf_index_suffix << llvm::format_hex(objfile->GetCacheHash(), 10); return key; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h index d8c4a22ab21f7..6a52c88a99220 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h @@ -168,6 +168,16 @@ class ManualDWARFIndex : public DWARFIndex { const lldb::LanguageType cu_language, IndexSet &set); + /// Return true if this manual DWARF index is covering only part of the DWARF. + /// + /// An instance of this class will be used to index all of the DWARF, but also + /// when we have .debug_names we will use one to index any compile or type + /// units that are not covered by the .debug_names table. + /// + /// \return + /// True if this index is a partial index, false otherwise. + bool IsPartial() const; + /// The DWARF file which we are indexing. SymbolFileDWARF *m_dwarf; /// Which dwarf units should we skip while building the index. diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index fe711c56958c4..6f19b264eb3dd 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -1305,121 +1305,76 @@ bool SymbolFileDWARF::ParseDebugMacros(CompileUnit &comp_unit) { return true; } -size_t SymbolFileDWARF::ParseBlocksRecursive( - lldb_private::CompileUnit &comp_unit, Block *parent_block, - const DWARFDIE &orig_die, addr_t subprogram_low_pc, uint32_t depth) { +size_t SymbolFileDWARF::ParseBlocksRecursive(CompileUnit &comp_unit, + Block *parent_block, DWARFDIE die, + addr_t subprogram_low_pc) { size_t blocks_added = 0; - DWARFDIE die = orig_die; - while (die) { + for (; die; die = die.GetSibling()) { dw_tag_t tag = die.Tag(); - switch (tag) { - case DW_TAG_inlined_subroutine: - case DW_TAG_subprogram: - case DW_TAG_lexical_block: { - Block *block = nullptr; - if (tag == DW_TAG_subprogram) { - // Skip any DW_TAG_subprogram DIEs that are inside of a normal or - // inlined functions. These will be parsed on their own as separate - // entities. - - if (depth > 0) - break; + if (tag != DW_TAG_inlined_subroutine && tag != DW_TAG_lexical_block) + continue; - block = parent_block; - } else { - block = parent_block->CreateChild(die.GetID()).get(); - } - DWARFRangeList ranges; - const char *name = nullptr; - const char *mangled_name = nullptr; - - std::optional decl_file; - std::optional decl_line; - std::optional decl_column; - std::optional call_file; - std::optional call_line; - std::optional call_column; - if (die.GetDIENamesAndRanges(name, mangled_name, ranges, decl_file, - decl_line, decl_column, call_file, call_line, - call_column, nullptr)) { - if (tag == DW_TAG_subprogram) { - assert(subprogram_low_pc == LLDB_INVALID_ADDRESS); - subprogram_low_pc = ranges.GetMinRangeBase(0); - } else if (tag == DW_TAG_inlined_subroutine) { - // We get called here for inlined subroutines in two ways. The first - // time is when we are making the Function object for this inlined - // concrete instance. Since we're creating a top level block at - // here, the subprogram_low_pc will be LLDB_INVALID_ADDRESS. So we - // need to adjust the containing address. 
The second time is when we - // are parsing the blocks inside the function that contains the - // inlined concrete instance. Since these will be blocks inside the - // containing "real" function the offset will be for that function. - if (subprogram_low_pc == LLDB_INVALID_ADDRESS) { - subprogram_low_pc = ranges.GetMinRangeBase(0); - } - } - - const size_t num_ranges = ranges.GetSize(); - for (size_t i = 0; i < num_ranges; ++i) { - const DWARFRangeList::Entry &range = ranges.GetEntryRef(i); - const addr_t range_base = range.GetRangeBase(); - if (range_base >= subprogram_low_pc) - block->AddRange(Block::Range(range_base - subprogram_low_pc, - range.GetByteSize())); - else { - GetObjectFile()->GetModule()->ReportError( - "{0:x8}: adding range [{1:x16}-{2:x16}) which has a base " - "that is less than the function's low PC {3:x16}. Please file " - "a bug and attach the file at the " - "start of this error message", - block->GetID(), range_base, range.GetRangeEnd(), - subprogram_low_pc); - } - } - block->FinalizeRanges(); - - if (tag != DW_TAG_subprogram && - (name != nullptr || mangled_name != nullptr)) { - std::unique_ptr decl_up; - if (decl_file || decl_line || decl_column) - decl_up = std::make_unique( - comp_unit.GetSupportFiles().GetFileSpecAtIndex( - decl_file ? *decl_file : 0), - decl_line ? *decl_line : 0, decl_column ? *decl_column : 0); - - std::unique_ptr call_up; - if (call_file || call_line || call_column) - call_up = std::make_unique( - comp_unit.GetSupportFiles().GetFileSpecAtIndex( - call_file ? *call_file : 0), - call_line ? *call_line : 0, call_column ? *call_column : 0); - - block->SetInlinedFunctionInfo(name, mangled_name, decl_up.get(), - call_up.get()); + Block *block = parent_block->CreateChild(die.GetID()).get(); + DWARFRangeList ranges; + const char *name = nullptr; + const char *mangled_name = nullptr; + + std::optional decl_file; + std::optional decl_line; + std::optional decl_column; + std::optional call_file; + std::optional call_line; + std::optional call_column; + if (die.GetDIENamesAndRanges(name, mangled_name, ranges, decl_file, + decl_line, decl_column, call_file, call_line, + call_column, nullptr)) { + const size_t num_ranges = ranges.GetSize(); + for (size_t i = 0; i < num_ranges; ++i) { + const DWARFRangeList::Entry &range = ranges.GetEntryRef(i); + const addr_t range_base = range.GetRangeBase(); + if (range_base >= subprogram_low_pc) + block->AddRange(Block::Range(range_base - subprogram_low_pc, + range.GetByteSize())); + else { + GetObjectFile()->GetModule()->ReportError( + "{0:x8}: adding range [{1:x16}-{2:x16}) which has a base " + "that is less than the function's low PC {3:x16}. Please file " + "a bug and attach the file at the " + "start of this error message", + block->GetID(), range_base, range.GetRangeEnd(), + subprogram_low_pc); } + } + block->FinalizeRanges(); + + if (tag != DW_TAG_subprogram && + (name != nullptr || mangled_name != nullptr)) { + std::unique_ptr decl_up; + if (decl_file || decl_line || decl_column) + decl_up = std::make_unique( + comp_unit.GetSupportFiles().GetFileSpecAtIndex( + decl_file ? *decl_file : 0), + decl_line ? *decl_line : 0, decl_column ? *decl_column : 0); + + std::unique_ptr call_up; + if (call_file || call_line || call_column) + call_up = std::make_unique( + comp_unit.GetSupportFiles().GetFileSpecAtIndex( + call_file ? *call_file : 0), + call_line ? *call_line : 0, call_column ? 
*call_column : 0); + + block->SetInlinedFunctionInfo(name, mangled_name, decl_up.get(), + call_up.get()); + } - ++blocks_added; + ++blocks_added; - if (die.HasChildren()) { - blocks_added += - ParseBlocksRecursive(comp_unit, block, die.GetFirstChild(), - subprogram_low_pc, depth + 1); - } + if (die.HasChildren()) { + blocks_added += ParseBlocksRecursive( + comp_unit, block, die.GetFirstChild(), subprogram_low_pc); } - } break; - default: - break; } - - // Only parse siblings of the block if we are not at depth zero. A depth of - // zero indicates we are currently parsing the top level DW_TAG_subprogram - // DIE - - if (depth == 0) - die.Clear(); - else - die = die.GetSibling(); } return blocks_added; } @@ -3240,8 +3195,16 @@ size_t SymbolFileDWARF::ParseBlocksRecursive(Function &func) { DWARFDIE function_die = dwarf_cu->GetNonSkeletonUnit().GetDIE(function_die_offset); if (function_die) { - ParseBlocksRecursive(*comp_unit, &func.GetBlock(false), function_die, - LLDB_INVALID_ADDRESS, 0); + // We can't use the file address from the Function object as (in the OSO + // case) it will already be remapped to the main module. + DWARFRangeList ranges = function_die.GetDIE()->GetAttributeAddressRanges( + function_die.GetCU(), + /*check_hi_lo_pc=*/true); + lldb::addr_t function_file_addr = + ranges.GetMinRangeBase(LLDB_INVALID_ADDRESS); + if (function_file_addr != LLDB_INVALID_ADDRESS) + ParseBlocksRecursive(*comp_unit, &func.GetBlock(false), + function_die.GetFirstChild(), function_file_addr); } return functions_added; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index ac25a0c48ee7d..76f4188fdf4af 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -395,8 +395,7 @@ class SymbolFileDWARF : public SymbolFileCommon { Function *ParseFunction(CompileUnit &comp_unit, const DWARFDIE &die); size_t ParseBlocksRecursive(CompileUnit &comp_unit, Block *parent_block, - const DWARFDIE &die, - lldb::addr_t subprogram_low_pc, uint32_t depth); + DWARFDIE die, lldb::addr_t subprogram_low_pc); size_t ParseTypes(const SymbolContext &sc, const DWARFDIE &die, bool parse_siblings, bool parse_children); diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp index d17fedf26b4c4..27d51bbd1cb56 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp @@ -394,18 +394,12 @@ Block *SymbolFileNativePDB::CreateBlock(PdbCompilandSymId block_id) { switch (sym.kind()) { case S_GPROC32: - case S_LPROC32: { + case S_LPROC32: // This is a function. It must be global. Creating the Function entry // for it automatically creates a block for it. - FunctionSP func = GetOrCreateFunction(block_id, *comp_unit); - if (func) { - Block &block = func->GetBlock(false); - if (block.GetNumRanges() == 0) - block.AddRange(Block::Range(0, func->GetAddressRange().GetByteSize())); - return █ - } + if (FunctionSP func = GetOrCreateFunction(block_id, *comp_unit)) + return &func->GetBlock(false); break; - } case S_BLOCK32: { // This is a block. Its parent is either a function or another block. In // either case, its parent can be viewed as a block (e.g. 
a function diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp index 4935b0fbdfd87..b7854c05d345a 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp @@ -402,44 +402,32 @@ static size_t ParseFunctionBlocksForPDBSymbol( assert(pdb_symbol && parent_block); size_t num_added = 0; - switch (pdb_symbol->getSymTag()) { - case PDB_SymType::Block: - case PDB_SymType::Function: { - Block *block = nullptr; - auto &raw_sym = pdb_symbol->getRawSymbol(); - if (auto *pdb_func = llvm::dyn_cast(pdb_symbol)) { - if (pdb_func->hasNoInlineAttribute()) - break; - if (is_top_parent) - block = parent_block; - else - break; - } else if (llvm::isa(pdb_symbol)) { - auto uid = pdb_symbol->getSymIndexId(); - if (parent_block->FindBlockByID(uid)) - break; - if (raw_sym.getVirtualAddress() < func_file_vm_addr) - break; - block = parent_block->CreateChild(pdb_symbol->getSymIndexId()).get(); - } else - llvm_unreachable("Unexpected PDB symbol!"); + if (!is_top_parent) { + // Ranges for the top block were parsed together with the function. + if (pdb_symbol->getSymTag() != PDB_SymType::Block) + return num_added; + auto &raw_sym = pdb_symbol->getRawSymbol(); + assert(llvm::isa(pdb_symbol)); + auto uid = pdb_symbol->getSymIndexId(); + if (parent_block->FindBlockByID(uid)) + return num_added; + if (raw_sym.getVirtualAddress() < func_file_vm_addr) + return num_added; + + Block *block = parent_block->CreateChild(pdb_symbol->getSymIndexId()).get(); block->AddRange(Block::Range( raw_sym.getVirtualAddress() - func_file_vm_addr, raw_sym.getLength())); block->FinalizeRanges(); - ++num_added; + } + auto results_up = pdb_symbol->findAllChildren(); + if (!results_up) + return num_added; - auto results_up = pdb_symbol->findAllChildren(); - if (!results_up) - break; - while (auto symbol_up = results_up->getNext()) { - num_added += ParseFunctionBlocksForPDBSymbol( - func_file_vm_addr, symbol_up.get(), block, false); - } - } break; - default: - break; + while (auto symbol_up = results_up->getNext()) { + num_added += ParseFunctionBlocksForPDBSymbol( + func_file_vm_addr, symbol_up.get(), parent_block, false); } return num_added; } diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp index b346749ca06ec..4f07b946353a4 100644 --- a/lldb/source/Symbol/Function.cpp +++ b/lldb/source/Symbol/Function.cpp @@ -279,9 +279,14 @@ Function::Function(CompileUnit *comp_unit, lldb::user_id_t func_uid, AddressRanges ranges) : UserID(func_uid), m_comp_unit(comp_unit), m_type_uid(type_uid), m_type(type), m_mangled(mangled), m_block(*this, func_uid), - m_ranges(std::move(ranges)), m_range(CollapseRanges(m_ranges)), - m_frame_base(), m_flags(), m_prologue_byte_size(0) { + m_range(CollapseRanges(ranges)), m_prologue_byte_size(0) { assert(comp_unit != nullptr); + lldb::addr_t base_file_addr = m_range.GetBaseAddress().GetFileAddress(); + for (const AddressRange &range : ranges) + m_block.AddRange( + Block::Range(range.GetBaseAddress().GetFileAddress() - base_file_addr, + range.GetByteSize())); + m_block.FinalizeRanges(); } Function::~Function() = default; @@ -426,13 +431,16 @@ void Function::GetDescription(Stream *s, lldb::DescriptionLevel level, llvm::interleaveComma(decl_context, *s, [&](auto &ctx) { ctx.Dump(*s); }); *s << "}"; } - *s << ", range" << (m_ranges.size() > 1 ? "s" : "") << " = "; + *s << ", range" << (m_block.GetNumRanges() > 1 ? 
"s" : "") << " = "; Address::DumpStyle fallback_style = level == eDescriptionLevelVerbose ? Address::DumpStyleModuleWithFileAddress : Address::DumpStyleFileAddress; - for (const AddressRange &range : m_ranges) + for (unsigned idx = 0; idx < m_block.GetNumRanges(); ++idx) { + AddressRange range; + m_block.GetRangeAtIndex(idx, range); range.Dump(s, target, Address::DumpStyleLoadAddress, fallback_style); + } } void Function::Dump(Stream *s, bool show_context) const { diff --git a/lldb/test/API/python_api/find_in_memory/TestFindInMemory.py b/lldb/test/API/python_api/find_in_memory/TestFindInMemory.py index 04e807c5c6201..1ef37d2ec9898 100644 --- a/lldb/test/API/python_api/find_in_memory/TestFindInMemory.py +++ b/lldb/test/API/python_api/find_in_memory/TestFindInMemory.py @@ -152,3 +152,16 @@ def test_find_in_memory_unaligned(self): ) self.assertSuccess(error) self.assertEqual(addr, lldb.LLDB_INVALID_ADDRESS) + + def test_memory_info_list_iterable(self): + """Make sure the SBMemoryRegionInfoList is iterable""" + self.assertTrue(self.process, PROCESS_IS_VALID) + self.assertState(self.process.GetState(), lldb.eStateStopped, PROCESS_STOPPED) + + info_list = self.process.GetMemoryRegions() + self.assertTrue(info_list.GetSize() > 0) + try: + for info in info_list: + pass + except Exception: + self.fail("SBMemoryRegionInfoList is not iterable") diff --git a/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s b/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s new file mode 100644 index 0000000000000..a9e4104f2aaf7 --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s @@ -0,0 +1,182 @@ +# REQUIRES: x86 + +# RUN: split-file %s %t +# RUN: llvm-mc -triple x86_64-pc-linux -filetype=obj %t/input.s -o %t/input.o +# RUN: %lldb %t/input.o -o "command script import %t/script.py" -o exit | FileCheck %s + +# CHECK: Found 1 function(s). +# CHECK: foo: [input.o[0x0-0xe), input.o[0x14-0x1c)] + +#--- script.py +import lldb + +def __lldb_init_module(debugger, internal_dict): + target = debugger.GetSelectedTarget() + sym_ctxs = target.FindFunctions("foo") + print(f"Found {len(sym_ctxs)} function(s).") + for ctx in sym_ctxs: + fn = ctx.function + print(f"{fn.name}: {fn.GetRanges()}") + +#--- input.s +# An example of a function which has been split into two parts. Roughly +# corresponds to this C code. +# int baz(); +# int bar() { return 47; } +# int foo(int flag) { return flag ? bar() : baz(); } +# The function bar has been placed "in the middle" of foo. 
+ + .text + + .type foo,@function +foo: + .cfi_startproc + cmpl $0, %edi + je foo.__part.2 + jmp foo.__part.1 + .cfi_endproc +.Lfoo_end: + .size foo, .Lfoo_end-foo + +foo.__part.1: + .cfi_startproc + callq bar + jmp foo.__part.3 +.Lfoo.__part.1_end: + .size foo.__part.1, .Lfoo.__part.1_end-foo.__part.1 + .cfi_endproc + +bar: + .cfi_startproc + movl $47, %eax + retq + .cfi_endproc +.Lbar_end: + .size bar, .Lbar_end-bar + +foo.__part.2: + .cfi_startproc + callq baz + jmp foo.__part.3 +.Lfoo.__part.2_end: + .size foo.__part.2, .Lfoo.__part.2_end-foo.__part.2 + .cfi_endproc + +foo.__part.3: + .cfi_startproc + retq +.Lfoo.__part.3_end: + .size foo.__part.3, .Lfoo.__part.3_end-foo.__part.3 + .cfi_endproc + + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 85 # DW_AT_ranges + .byte 35 # DW_FORM_rnglistx + .byte 116 # DW_AT_rnglists_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 1 # DW_FORM_addr + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 85 # DW_AT_ranges + .byte 35 # DW_FORM_rnglistx + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 1 # Abbrev [1] DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .short 29 # DW_AT_language + .quad 0 # DW_AT_low_pc + .byte 1 # DW_AT_ranges + .long .Lrnglists_table_base0 # DW_AT_rnglists_base + .byte 2 # Abbrev [2] DW_TAG_subprogram + .quad bar # DW_AT_low_pc + .quad .Lbar_end # DW_AT_high_pc + .asciz "bar" # DW_AT_name + .byte 3 # Abbrev [3] DW_TAG_subprogram + .byte 0 # DW_AT_ranges + .byte 1 # DW_AT_frame_base + .byte 86 + .asciz "foo" # DW_AT_name + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + + .section .debug_rnglists,"",@progbits + .long .Ldebug_list_header_end0-.Ldebug_list_header_start0 # Length +.Ldebug_list_header_start0: + .short 5 # Version + .byte 8 # Address size + .byte 0 # Segment selector size + .long 2 # Offset entry count +.Lrnglists_table_base0: + .long .Ldebug_ranges0-.Lrnglists_table_base0 + .long .Ldebug_ranges1-.Lrnglists_table_base0 +.Ldebug_ranges0: + .byte 6 # DW_RLE_start_end + .quad foo + .quad .Lfoo_end + .byte 6 # DW_RLE_start_end + .quad foo.__part.1 + .quad .Lfoo.__part.1_end + .byte 6 # DW_RLE_start_end + .quad foo.__part.2 + .quad .Lfoo.__part.2_end + .byte 6 # DW_RLE_start_end + .quad foo.__part.3 + .quad .Lfoo.__part.3_end + .byte 0 # DW_RLE_end_of_list +.Ldebug_ranges1: + .byte 6 # DW_RLE_start_end + .quad bar + .quad .Lbar_end + .byte 6 # DW_RLE_start_end + .quad foo.__part.1 + .quad .Lfoo.__part.1_end + .byte 6 # DW_RLE_start_end + .quad foo.__part.2 + .quad .Lfoo.__part.2_end + .byte 6 # DW_RLE_start_end + .quad foo.__part.3 + .quad .Lfoo.__part.3_end + .byte 6 # DW_RLE_start_end + .quad foo + .quad .Lfoo_end + .byte 0 # DW_RLE_end_of_list +.Ldebug_list_header_end0: + + .section ".note.GNU-stack","",@progbits diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/discontinuous-function.s b/lldb/test/Shell/SymbolFile/DWARF/x86/discontinuous-function.s index 2584158207cc8..b03d5d12ad2a1 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/discontinuous-function.s +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/discontinuous-function.s @@ -10,7 +10,7 @@ # CHECK: 1 match found in {{.*}} # CHECK: Summary: {{.*}}`foo -# CHECK: Function: id = {{.*}}, name = "foo", ranges = [0x0000000000000000-0x0000000000000007)[0x0000000000000007-0x000000000000000e)[0x0000000000000014-0x000000000000001b)[0x000000000000001b-0x000000000000001c) +# CHECK: Function: id = {{.*}}, name = "foo", ranges = [0x0000000000000000-0x000000000000000e)[0x0000000000000014-0x000000000000001c) .text diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp new file mode 100644 index 0000000000000..3e97c3fb1ebc2 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/dwp-index-cache.cpp @@ -0,0 +1,62 @@ +// REQUIRES: lld + +// Test that if we build a mixed binary where one .o file has a .debug_names and +// another doesn't have one, we save a full or partial index cache. +// Previous versions of LLDB had ManualDWARFIndex.cpp save out +// an index cache to the same file regardless of whether the index cache was a +// full DWARF manual index, or just the CUs and TUs that were missing from any +// .debug_names tables.
If the user had a .debug_names table and debugged once +// with index caching enabled, then debugged again but set the setting to ignore +// .debug_names ('settings set plugin.symbol-file.dwarf.ignore-file-indexes 1'), +// this could cause LLDB to load the index cache from the previous run, which +// was incomplete: it contained only the manually indexed DWARF from the run +// that used .debug_names, yet it would now be loaded as if it were the +// complete DWARF index. + +// Test that if we don't have .debug_names, we save a full DWARF index. +// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o +// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o +// RUN: ld.lld %t.main.o %t.foo.o -o %t.nonames +// RUN: llvm-dwp %t.main.dwo %t.foo.dwo -o %t.nonames.dwp +// RUN: rm %t.main.dwo %t.foo.dwo +// Run one time with the index cache enabled to populate the index cache. When +// we populate the index cache we have to parse all of the DWARF debug info +// and it is always available. +// RUN: rm -rf %t.lldb-index-cache +// RUN: %lldb \ +// RUN: -O 'settings set symbols.enable-lldb-index-cache true' \ +// RUN: -O 'settings set symbols.lldb-index-cache-path %t.lldb-index-cache' \ +// RUN: -O 'settings set target.preload-symbols true' \ +// RUN: %t.nonames -b + +// Make sure there is a file with "dwarf-index-full" in its filename +// RUN: ls %t.lldb-index-cache | FileCheck %s -check-prefix=FULL +// FULL: {{dwp-index-cache.cpp.tmp.nonames.*-dwarf-index-full-}} + +// Test that if we have one .o file with .debug_names and one without, we +// save a partial DWARF index. +// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=1 -c %s -o %t.main.o -gpubnames +// RUN: %clang -target x86_64-pc-linux -gsplit-dwarf -gdwarf-5 -DMAIN=0 -c %s -o %t.foo.o +// RUN: ld.lld %t.main.o %t.foo.o -o %t.somenames +// RUN: llvm-dwp %t.main.dwo %t.foo.dwo -o %t.somenames.dwp +// RUN: rm %t.main.dwo %t.foo.dwo +// Run one time with the index cache enabled to populate the index cache. When +// we populate the index cache we have to parse all of the DWARF debug info +// and it is always available.
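+// Note: only %t.main.o is built with -gpubnames above, so it should be the only
+// object carrying a .debug_names table; %t.foo.o still has to be indexed
+// manually, which is what should make the cached index "partial" rather than
+// "full".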
+// RUN: rm -rf %t.lldb-index-cache +// RUN: %lldb \ +// RUN: -O 'settings set symbols.enable-lldb-index-cache true' \ +// RUN: -O 'settings set symbols.lldb-index-cache-path %t.lldb-index-cache' \ +// RUN: -O 'settings set target.preload-symbols true' \ +// RUN: %t.somenames -b + +// Make sure there is a file with "dwarf-index-partial" in its filename +// RUN: ls %t.lldb-index-cache | FileCheck %s -check-prefix=PARTIAL +// PARTIAL: {{dwp-index-cache.cpp.tmp.somenames.*-dwarf-index-partial-}} + +#if MAIN +extern int foo(); +int main() { return foo(); } +#else +int foo() { return 0; } +#endif diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp index 2ed7b219d8da3..af49206608723 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/find-basic-type.cpp @@ -4,6 +4,8 @@ // RUN: ld.lld %t.o -o %t // RUN: lldb-test symbols --name=foo --find=type %t | \ // RUN: FileCheck --check-prefix=NAME %s +// RUN: lldb-test symbols --name=::foo --find=type %t | \ +// RUN: FileCheck --check-prefix=EXACT %s // RUN: lldb-test symbols --name=foo --context=context --find=type %t | \ // RUN: FileCheck --check-prefix=CONTEXT %s // RUN: lldb-test symbols --name=not_there --find=type %t | \ @@ -12,6 +14,8 @@ // RUN: %clang %s -g -c -o %t --target=x86_64-apple-macosx // RUN: lldb-test symbols --name=foo --find=type %t | \ // RUN: FileCheck --check-prefix=NAME %s +// RUN: lldb-test symbols --name=::foo --find=type %t | \ +// RUN: FileCheck --check-prefix=EXACT %s // RUN: lldb-test symbols --name=foo --context=context --find=type %t | \ // RUN: FileCheck --check-prefix=CONTEXT %s // RUN: lldb-test symbols --name=not_there --find=type %t | \ @@ -22,6 +26,8 @@ // RUN: llvm-readobj --sections %t | FileCheck %s --check-prefix NAMES // RUN: lldb-test symbols --name=foo --find=type %t | \ // RUN: FileCheck --check-prefix=NAME %s +// RUN: lldb-test symbols --name=::foo --find=type %t | \ +// RUN: FileCheck --check-prefix=EXACT %s // RUN: lldb-test symbols --name=foo --context=context --find=type %t | \ // RUN: FileCheck --check-prefix=CONTEXT %s // RUN: lldb-test symbols --name=not_there --find=type %t | \ @@ -31,9 +37,11 @@ // EMPTY: Found 0 types: // NAME: Found 4 types: +// EXACT: Found 1 types: // CONTEXT: Found 1 types: struct foo { }; // NAME-DAG: name = "foo", {{.*}} decl = find-basic-type.cpp:[[@LINE-1]] +// EXACT-DAG: name = "foo", {{.*}} decl = find-basic-type.cpp:[[@LINE-2]] namespace bar { int context; diff --git a/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test b/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test index 1cb20a4036382..9057d01c25840 100644 --- a/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test +++ b/lldb/test/Shell/SymbolFile/PDB/function-nested-block.test @@ -2,7 +2,6 @@ REQUIRES: system-windows, lld RUN: %build --compiler=clang-cl --nodefaultlib --output=%t.exe %S/Inputs/FunctionNestedBlockTest.cpp RUN: lldb-test symbols -find=function -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-FUNCTION %s RUN: lldb-test symbols -find=block -file FunctionNestedBlockTest.cpp -line 4 %t.exe | FileCheck --check-prefix=CHECK-BLOCK %s -XFAIL: * CHECK-FUNCTION: Found 1 functions: CHECK-FUNCTION: name = "{{.*}}", mangled = "{{_?}}main" diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md index 42b5f501e32c6..8196dfdd5073c 100644 --- a/lldb/tools/lldb-dap/README.md +++ b/lldb/tools/lldb-dap/README.md @@ 
-6,9 +6,9 @@ The extension requires the `lldb-dap` (formerly `lldb-vscode`) binary. This binary is not packaged with the VS Code extension. There are multiple ways to obtain this binary: -* use the binary provided by your toolchain (for example `xcrun -f lldb-dap` on macOS) or contact your toolchain vendor to include it. -* download one of the relase packages from the [LLVM release page](https://github.com/llvm/llvm-project/releases/). The `LLVM-19.1.0-{operating_system}.tar.xz` packages contain a prebuilt `lldb-dap` binary. -* build it from source (see [LLDB's build instructions](https://lldb.llvm.org/resources/build.html)) +* Use the binary provided by your toolchain (for example `xcrun -f lldb-dap` on macOS) or contact your toolchain vendor to include it. +* Download one of the release packages from the [LLVM release page](https://github.com/llvm/llvm-project/releases/). The `LLVM-19.1.0-{operating_system}.tar.xz` packages contain a prebuilt `lldb-dap` binary. +* Build it from source (see [LLDB's build instructions](https://lldb.llvm.org/resources/build.html)). By default, the VS Code extension will expect to find `lldb-dap` in your `PATH`. Alternatively, you can explicitly specify the location of the `lldb-dap` binary using the `lldb-dap.executable-path` setting. @@ -179,26 +179,26 @@ The default hostname being used `localhost`. For both launch and attach configurations, lldb-dap accepts the following `lldb-dap` specific key/value pairs: -|parameter |type|req | | -|-------------------|----|:--:|---------| -|**name** |string|Y| A configuration name that will be displayed in the IDE. -|**type** |string|Y| Must be "lldb-dap". -|**request** |string|Y| Must be "launch" or "attach". -|**program** |string|Y| Path to the executable to launch. -|**sourcePath** |string| | Specify a source path to remap \"./\" to allow full paths to be used when setting breakpoints in binaries that have relative source paths. -|**sourceMap** |[string[2]]| | Specify an array of path re-mappings. Each element in the array must be a two element array containing a source and destination pathname. Overrides sourcePath. -|**debuggerRoot** | string| |Specify a working directory to use when launching lldb-dap. If the debug information in your executable contains relative paths, this option can be used so that `lldb-dap` can find source files and object files that have relative paths. -|**commandEscapePrefix** | string | | The escape prefix to use for executing regular LLDB commands in the Debug Console, instead of printing variables. Defaults to a backtick. If it's an empty string, then all expression in the Debug Console are treated as regular LLDB commands. -|**customFrameFormat** | string | | If non-empty, stack frames will have descriptions generated based on the provided format. See https://lldb.llvm.org/use/formatting.html for an explanation on format strings for frames. If the format string contains errors, an error message will be displayed on the Debug Console and the default frame names will be used. This might come with a performance cost because debug information might need to be processed to generate the description. -|**customThreadFormat** | string | | Same as `customFrameFormat`, but for threads instead of stack frames. -|**displayExtendedBacktrace**|bool| | Enable language specific extended backtraces. -|**enableAutoVariableSummaries**|bool| | Enable auto generated summaries for variables when no summaries exist for a given type.
This feature can cause performance delays in large projects when viewing variables. -|**enableSyntheticChildDebugging**|bool| | If a variable is displayed using a synthetic children, also display the actual contents of the variable at the end under a [raw] entry. This is useful when creating sythetic child plug-ins as it lets you see the actual contents of the variable. -|**initCommands** |[string]| | LLDB commands executed upon debugger startup prior to creating the LLDB target. -|**preRunCommands** |[string]| | LLDB commands executed just before launching/attaching, after the LLDB target has been created. -|**stopCommands** |[string]| | LLDB commands executed just after each stop. -|**exitCommands** |[string]| | LLDB commands executed when the program exits. -|**terminateCommands** |[string]| | LLDB commands executed when the debugging session ends. +| Parameter | Type | Req | | +|-----------------------------------|-------------|:---:|---------| +| **name** | string | Y | A configuration name that will be displayed in the IDE. +| **type** | string | Y | Must be "lldb-dap". +| **request** | string | Y | Must be "launch" or "attach". +| **program** | string | Y | Path to the executable to launch. +| **sourcePath** | string | | Specify a source path to remap \"./\" to allow full paths to be used when setting breakpoints in binaries that have relative source paths. +| **sourceMap** | [string[2]] | | Specify an array of path re-mappings. Each element in the array must be a two element array containing a source and destination pathname. Overrides sourcePath. +| **debuggerRoot** | string | | Specify a working directory to use when launching lldb-dap. If the debug information in your executable contains relative paths, this option can be used so that `lldb-dap` can find source files and object files that have relative paths. +| **commandEscapePrefix** | string | | The escape prefix to use for executing regular LLDB commands in the Debug Console, instead of printing variables. Defaults to a backtick. If it's an empty string, then all expressions in the Debug Console are treated as regular LLDB commands. +| **customFrameFormat** | string | | If non-empty, stack frames will have descriptions generated based on the provided format. See https://lldb.llvm.org/use/formatting.html for an explanation on format strings for frames. If the format string contains errors, an error message will be displayed on the Debug Console and the default frame names will be used. This might come with a performance cost because debug information might need to be processed to generate the description. +| **customThreadFormat** | string | | Same as `customFrameFormat`, but for threads instead of stack frames. +| **displayExtendedBacktrace** | bool | | Enable language specific extended backtraces. +| **enableAutoVariableSummaries** | bool | | Enable auto generated summaries for variables when no summaries exist for a given type. This feature can cause performance delays in large projects when viewing variables. +| **enableSyntheticChildDebugging** | bool | | If a variable is displayed using synthetic children, also display the actual contents of the variable at the end under a [raw] entry. This is useful when creating synthetic child plug-ins as it lets you see the actual contents of the variable. +| **initCommands** | [string] | | LLDB commands executed upon debugger startup prior to creating the LLDB target.
+| **preRunCommands** | [string] | | LLDB commands executed just before launching/attaching, after the LLDB target has been created. +| **stopCommands** | [string] | | LLDB commands executed just after each stop. +| **exitCommands** | [string] | | LLDB commands executed when the program exits. +| **terminateCommands** | [string] | | LLDB commands executed when the debugging session ends. All commands and command outputs will be sent to the debugger console when they are executed. Commands can be prefixed with `?` or `!` to modify their behavior: @@ -208,25 +208,25 @@ Commands can be prefixed with `?` or `!` to modify their behavior: For JSON configurations of `"type": "launch"`, the JSON configuration can additionally contain the following key/value pairs: -|parameter |type|req | | -|-------------------|----|:--:|---------| -|**program** |string|Y| Path to the executable to launch. -|**args** |[string]|| An array of command line argument strings to be passed to the program being launched. -|**cwd** |string| | The program working directory. -|**env** |dictionary| | Environment variables to set when launching the program. The format of each environment variable string is "VAR=VALUE" for environment variables with values or just "VAR" for environment variables with no values. -|**stopOnEntry** |boolean| | Whether to stop program immediately after launching. -|**runInTerminal** |boolean| | Launch the program inside an integrated terminal in the IDE. Useful for debugging interactive command line programs. -|**launchCommands** |[string]| | LLDB commands executed to launch the program. +| Parameter | Type | Req | | +|-----------------------------------|-------------|:---:|---------| +| **program** | string | Y | Path to the executable to launch. +| **args** | [string] | | An array of command line argument strings to be passed to the program being launched. +| **cwd** | string | | The program working directory. +| **env** | dictionary | | Environment variables to set when launching the program. The format of each environment variable string is "VAR=VALUE" for environment variables with values or just "VAR" for environment variables with no values. +| **stopOnEntry** | boolean | | Whether to stop the program immediately after launching. +| **runInTerminal** | boolean | | Launch the program inside an integrated terminal in the IDE. Useful for debugging interactive command line programs. +| **launchCommands** | [string] | | LLDB commands executed to launch the program. For JSON configurations of `"type": "attach"`, the JSON configuration can contain the following `lldb-dap` specific key/value pairs: -|parameter |type |req | | -|-------------------|--------|:--:|---------| -|**program** |string | | Path to the executable to attach to. This value is optional but can help to resolve breakpoints prior the attaching to the program. -|**pid** |number | | The process id of the process you wish to attach to. If **pid** is omitted, the debugger will attempt to attach to the program by finding a process whose file name matches the file name from **porgram**. Setting this value to `${command:pickMyProcess}` will allow interactive process selection in the IDE. -|**waitFor** |boolean | | Wait for the process to launch. -|**attachCommands** |[string]| | LLDB commands that will be executed after **preRunCommands** which take place of the code that normally does the attach. The commands can create a new target and attach or launch it however desired. This allows custom launch and attach configurations.
Core files can use `target create --core /path/to/core` to attach to core files. +| Parameter | Type | Req | | +|-----------------------------------|-------------|:---:|---------| +| **program** | string | | Path to the executable to attach to. This value is optional but can help to resolve breakpoints prior to attaching to the program. +| **pid** | number | | The process id of the process you wish to attach to. If **pid** is omitted, the debugger will attempt to attach to the program by finding a process whose file name matches the file name from **program**. Setting this value to `${command:pickMyProcess}` will allow interactive process selection in the IDE. +| **waitFor** | boolean | | Wait for the process to launch. +| **attachCommands** | [string] | | LLDB commands that will be executed after **preRunCommands**, which take the place of the code that normally does the attach. The commands can create a new target and attach or launch it however desired. This allows custom launch and attach configurations. Core files can use `target create --core /path/to/core` to attach to core files. ## Debug Console @@ -295,7 +295,7 @@ and may also be adjusted at runtime using the lldb command lldb-dap includes a command to trigger a Debug Adapter Protocol event from a script. -The event maybe a custom DAP event or a standard event, if the event is not +The event may be a custom DAP event or a standard event, if the event is not handled internally by `lldb-dap`. This command has the format: @@ -316,9 +316,9 @@ For example you can use a launch configuration hook to trigger custom events lik } ``` -[See the specification](https://microsoft.github.io/debug-adapter-protocol/specification#Base_Protocol_Event) -for more details on Debug Adapter Protocol events and the VS Code -[debug.onDidReceiveDebugSessionCustomEvent](https://code.visualstudio.com/api/references/vscode-api#debug.onDidReceiveDebugSessionCustomEvent) +[See the specification](https://microsoft.github.io/debug-adapter-protocol/specification#Base_Protocol_Event) +for more details on Debug Adapter Protocol events and the VS Code +[debug.onDidReceiveDebugSessionCustomEvent](https://code.visualstudio.com/api/references/vscode-api#debug.onDidReceiveDebugSessionCustomEvent) API for handling a custom event from an extension. 
## Contributing diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json index 9155163c65ba5..5e9a7de9109ec 100644 --- a/lldb/tools/lldb-dap/package.json +++ b/lldb/tools/lldb-dap/package.json @@ -44,7 +44,7 @@ "format": "npx prettier './src-ts/' --write", "package": "vsce package --out ./out/lldb-dap.vsix", "publish": "vsce publish", - "vscode-uninstall": "code --uninstall-extension llvm.lldb-dap", + "vscode-uninstall": "code --uninstall-extension llvm-vs-code-extensions.lldb-dap", "vscode-install": "code --install-extension ./out/lldb-dap.vsix" }, "contributes": { @@ -513,4 +513,4 @@ } ] } -} \ No newline at end of file +} diff --git a/lldb/unittests/Host/SocketTest.cpp b/lldb/unittests/Host/SocketTest.cpp index b20cfe5464028..a74352c19725d 100644 --- a/lldb/unittests/Host/SocketTest.cpp +++ b/lldb/unittests/Host/SocketTest.cpp @@ -88,6 +88,28 @@ TEST_P(SocketTest, DomainListenConnectAccept) { CreateDomainConnectedSockets(Path, &socket_a_up, &socket_b_up); } +TEST_P(SocketTest, DomainListenGetListeningConnectionURI) { + llvm::SmallString<64> Path; + std::error_code EC = + llvm::sys::fs::createUniqueDirectory("DomainListenConnectAccept", Path); + ASSERT_FALSE(EC); + llvm::sys::path::append(Path, "test"); + + // Skip the test if the $TMPDIR is too long to hold a domain socket. + if (Path.size() > 107u) + return; + + auto listen_socket_up = std::make_unique<DomainSocket>( + /*should_close=*/true); + Status error = listen_socket_up->Listen(Path, 5); + ASSERT_THAT_ERROR(error.ToError(), llvm::Succeeded()); + ASSERT_TRUE(listen_socket_up->IsValid()); + + ASSERT_THAT( + listen_socket_up->GetListeningConnectionURI(), + testing::ElementsAre(llvm::formatv("unix-connect://{0}", Path).str())); +} + TEST_P(SocketTest, DomainMainLoopAccept) { llvm::SmallString<64> Path; std::error_code EC = @@ -225,12 +247,29 @@ TEST_P(SocketTest, TCPListen0GetPort) { if (!HostSupportsIPv4()) return; llvm::Expected<std::unique_ptr<TCPSocket>> sock = - Socket::TcpListen("10.10.12.3:0", false); + Socket::TcpListen("10.10.12.3:0", 5); ASSERT_THAT_EXPECTED(sock, llvm::Succeeded()); ASSERT_TRUE(sock.get()->IsValid()); EXPECT_NE(sock.get()->GetLocalPortNumber(), 0); } +TEST_P(SocketTest, TCPListen0GetListeningConnectionURI) { + if (!HostSupportsProtocol()) + return; + + std::string addr = llvm::formatv("[{0}]:0", GetParam().localhost_ip).str(); + llvm::Expected<std::unique_ptr<TCPSocket>> sock = Socket::TcpListen(addr); + ASSERT_THAT_EXPECTED(sock, llvm::Succeeded()); + ASSERT_TRUE(sock.get()->IsValid()); + + EXPECT_THAT( + sock.get()->GetListeningConnectionURI(), + testing::ElementsAre(llvm::formatv("connection://[{0}]:{1}", + GetParam().localhost_ip, + sock->get()->GetLocalPortNumber()) + .str())); +} + TEST_P(SocketTest, TCPGetConnectURI) { std::unique_ptr<TCPSocket> socket_a_up; std::unique_ptr<TCPSocket> socket_b_up; diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 569c01b61daac..f14065ab03799 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -186,13 +186,8 @@ if(LIBC_GPU_BUILD) list(APPEND RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "libc") endif() -set(NEED_LIBC_HDRGEN FALSE) -if("libc" IN_LIST LLVM_ENABLE_RUNTIMES) - set(NEED_LIBC_HDRGEN TRUE) -endif() foreach(_name ${LLVM_RUNTIME_TARGETS}) if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES) - set(NEED_LIBC_HDRGEN TRUE) if("${_name}" STREQUAL "amdgcn-amd-amdhsa" OR "${_name}" STREQUAL "nvptx64-nvidia-cuda") set(LLVM_LIBC_GPU_BUILD ON) endif() @@ -202,27 +197,11 @@ if("${LIBC_TARGET_TRIPLE}" STREQUAL "amdgcn-amd-amdhsa" OR "${LIBC_TARGET_TRIPLE}" STREQUAL "nvptx64-nvidia-cuda")
set(LLVM_LIBC_GPU_BUILD ON) endif() -if(NEED_LIBC_HDRGEN) - # To build the libc runtime, we need to be able to build few libc build - # tools from the "libc" project. So, we add it to the list of enabled - # projects. - if (NOT "libc" IN_LIST LLVM_ENABLE_PROJECTS) - message(STATUS "Enabling libc project to build libc build tools") - list(APPEND LLVM_ENABLE_PROJECTS "libc") - endif() +if (NOT "libc" IN_LIST LLVM_ENABLE_PROJECTS AND LLVM_LIBC_GPU_BUILD) + message(STATUS "Enabling libc project to build libc testing tools") + list(APPEND LLVM_ENABLE_PROJECTS "libc") endif() -foreach(proj IN LISTS LLVM_ENABLE_RUNTIMES) - if("${proj}" IN_LIST LLVM_ENABLE_PROJECTS) - # The 'libc' project bootstraps a few executables via the project build and - # should not emit an error currently. - if(NOT (NEED_LIBC_HDRGEN AND "${proj}" STREQUAL "libc")) - message(FATAL_ERROR "Runtime project \"${proj}\" found in LLVM_ENABLE_PROJECTS and LLVM_ENABLE_RUNTIMES. It must only appear in one of them and that one should almost always be LLVM_ENABLE_RUNTIMES.") - endif() - endif() -endforeach() -unset(NEED_LIBC_HDRGEN) - # LLVM_ENABLE_PROJECTS_USED is `ON` if the user has ever used the # `LLVM_ENABLE_PROJECTS` CMake cache variable. This exists for # several reasons: diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index c5f98f76bda31..63bdfd42528db 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -434,10 +434,12 @@ Others only have a lead maintainer listed here. [Flang maintainers](https://github.com/llvm/llvm-project/blob/main/flang/Maintainers.txt) -[LLD maintainers](https://github.com/llvm/llvm-project/blob/main/lld/CODE_OWNERS.TXT) +[LLD maintainers](https://github.com/llvm/llvm-project/blob/main/lld/Maintainers.md) [LLDB maintainers](https://github.com/llvm/llvm-project/blob/main/lldb/Maintainers.rst) +[LLVM OpenMP Library maintainers](https://github.com/llvm/llvm-project/blob/main/openmp/Maintainers.md) + #### libc++ Louis Dionne \ @@ -448,11 +450,6 @@ ldionne.2@gmail.com (email), [ldionne](https://github.com/ldionne) (GitHub) Tom Stellard \ tstellar@redhat.com (email), [tstellar](https://github.com/tstellar) (GitHub) -#### OpenMP (runtime library) - -Andrey Churbanov \ -andrey.churbanov@intel.com (email), [AndreyChurbanov](https://github.com/AndreyChurbanov) (GitHub) - #### Polly Tobias Grosser \ diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake index e36a71f522d82..c22d185349dcc 100644 --- a/llvm/cmake/modules/CrossCompile.cmake +++ b/llvm/cmake/modules/CrossCompile.cmake @@ -78,6 +78,9 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) list(APPEND libc_flags -DLLVM_FORCE_BUILD_RUNTIME=ON) endif() endif() + if(LLVM_LIBC_GPU_BUILD) + list(APPEND libc_flags -DLLVM_LIBC_GPU_BUILD=ON) + endif() add_custom_command(OUTPUT ${${project_name}_${target_name}_BUILD}/CMakeCache.txt COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 411a1209ef947..c076b877bc8a1 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -586,7 +586,7 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor ``gfx9-4-generic`` ``amdgcn`` - ``gfx940`` - xnack - Absolute flat FP8 and BF8 instructions, - ``gfx941`` - sramecc scratch FP8 and BF8 conversion instructions, - ``gfx942`` as well as instructions with XF32 format support - are not available. + - ``gfx950`` are not available. 
``gfx10-1-generic`` ``amdgcn`` - ``gfx1010`` - xnack - Absolute flat - The following instructions are diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index dc3f3aeb735f8..d8d9c4fc4bb8a 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -226,9 +226,15 @@ and `-mbulk-memory` flags, which correspond to the [Bulk Memory Operations] and [Non-trapping float-to-int Conversions] language features, which are [widely implemented in engines]. +A new Lime1 target CPU is added, -mcpu=lime1. This CPU follows the definition of +the Lime1 CPU [here], and enables -mmultivalue, -mmutable-globals, +-mcall-indirect-overlong, -msign-ext, -mbulk-memory-opt, -mnontrapping-fptoint, +and -mextended-const. + [Bulk Memory Operations]: https://github.com/WebAssembly/bulk-memory-operations/blob/master/proposals/bulk-memory-operations/Overview.md [Non-trapping float-to-int Conversions]: https://github.com/WebAssembly/spec/blob/master/proposals/nontrapping-float-to-int-conversion/Overview.md [widely implemented in engines]: https://webassembly.org/features/ +[here]: https://github.com/WebAssembly/tool-conventions/blob/main/Lime.md#lime1 Changes to the Windows Target ----------------------------- diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index ffb04dca00fc5..28e919fdf516a 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -141,16 +141,18 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na * - Extension Name - Description + * - ``SPV_EXT_arithmetic_fence`` + - Adds an instruction that prevents fast-math optimizations between its argument and the expression that contains it. + * - ``SPV_EXT_demote_to_helper_invocation`` + - Adds an instruction that demotes a fragment shader invocation to a helper invocation. + * - ``SPV_EXT_optnone`` + - Adds OptNoneEXT value for Function Control mask that indicates a request to not optimize the function. * - ``SPV_EXT_shader_atomic_float16_add`` - Extends the SPV_EXT_shader_atomic_float_add extension to support atomically adding to 16-bit floating-point numbers in memory. * - ``SPV_EXT_shader_atomic_float_add`` - Adds atomic add instruction on floating-point numbers. * - ``SPV_EXT_shader_atomic_float_min_max`` - Adds atomic min and max instruction on floating-point numbers. - * - ``SPV_EXT_arithmetic_fence`` - - Adds an instruction that prevents fast-math optimizations between its argument and the expression that contains it. - * - ``SPV_EXT_demote_to_helper_invocation`` - - Adds an instruction that demotes a fragment shader invocation to a helper invocation. * - ``SPV_INTEL_arbitrary_precision_integers`` - Allows generating arbitrary width integer types. * - ``SPV_INTEL_bfloat16_conversion`` @@ -165,6 +167,8 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na - Adds decorations that can be applied to global (module scope) variables. * - ``SPV_INTEL_global_variable_fpga_decorations`` - Adds decorations that can be applied to global (module scope) variables to help code generation for FPGA devices. + * - ``SPV_INTEL_media_block_io`` + - Adds additional subgroup block read and write functionality that allow applications to flexibly specify the width and height of the block to read from or write to a 2D image. * - ``SPV_INTEL_optnone`` - Adds OptNoneINTEL value for Function Control mask that indicates a request to not optimize the function. 
* - ``SPV_INTEL_split_barrier`` diff --git a/llvm/include/llvm/Analysis/ConstraintSystem.h b/llvm/include/llvm/Analysis/ConstraintSystem.h index 449852343964c..01eeadb17db9f 100644 --- a/llvm/include/llvm/Analysis/ConstraintSystem.h +++ b/llvm/include/llvm/Analysis/ConstraintSystem.h @@ -113,7 +113,9 @@ class ConstraintSystem { static SmallVector<int64_t, 8> negate(SmallVector<int64_t, 8> R) { // The negated constraint R is obtained by multiplying by -1 and adding 1 to // the constant. - R[0] += 1; + if (AddOverflow(R[0], int64_t(1), R[0])) + return {}; + return negateOrEqual(R); } diff --git a/llvm/include/llvm/Analysis/InstSimplifyFolder.h b/llvm/include/llvm/Analysis/InstSimplifyFolder.h index 430c3edc2f0dc..d4ae4dcc918cf 100644 --- a/llvm/include/llvm/Analysis/InstSimplifyFolder.h +++ b/llvm/include/llvm/Analysis/InstSimplifyFolder.h @@ -22,6 +22,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetFolder.h" +#include "llvm/IR/CmpPredicate.h" #include "llvm/IR/IRBuilderFolder.h" #include "llvm/IR/Instruction.h" diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h index cf7d3e044188a..fa291eeef198b 100644 --- a/llvm/include/llvm/Analysis/InstructionSimplify.h +++ b/llvm/include/llvm/Analysis/InstructionSimplify.h @@ -44,6 +44,7 @@ class DataLayout; class DominatorTree; class Function; class Instruction; +class CmpPredicate; class LoadInst; struct LoopStandardAnalysisResults; class Pass; @@ -152,11 +153,11 @@ Value *simplifyOrInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); Value *simplifyXorInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an ICmpInst, fold the result or return null. -Value *simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *simplifyICmpInst(CmpPredicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operands for an FCmpInst, fold the result or return null. -Value *simplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *simplifyFCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q); /// Given operands for a SelectInst, fold the result or return null. @@ -200,7 +201,7 @@ Value *simplifyShuffleVectorInst(Value *Op0, Value *Op1, ArrayRef<int> Mask, //=== Helper functions for higher up the class hierarchy. /// Given operands for a CmpInst, fold the result or return null. -Value *simplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q); /// Given operand for a UnaryOperator, fold the result or return null. diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index bd74d27e0c49b..8aa024a72afc8 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -159,7 +159,7 @@ bool isKnownPositive(const Value *V, const SimplifyQuery &SQ, /// Returns true if the given value is known to be negative (i.e. non-positive /// and non-zero). -bool isKnownNegative(const Value *V, const SimplifyQuery &DL, +bool isKnownNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth = 0); /// Return true if the given values are known to be non-equal when defined. @@ -180,7 +180,7 @@ bool isKnownNonEqual(const Value *V1, const Value *V2, const DataLayout &DL, /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. 
bool MaskedValueIsZero(const Value *V, const APInt &Mask, - const SimplifyQuery &DL, unsigned Depth = 0); + const SimplifyQuery &SQ, unsigned Depth = 0); /// Return the number of times the sign bit of the register is replicated into /// the other bits. We know that at least 1 bit is always equal to the sign @@ -1255,8 +1255,7 @@ std::optional<bool> isImpliedCondition(const Value *LHS, const Value *RHS, const DataLayout &DL, bool LHSIsTrue = true, unsigned Depth = 0); -std::optional<bool> isImpliedCondition(const Value *LHS, - CmpInst::Predicate RHSPred, +std::optional<bool> isImpliedCondition(const Value *LHS, CmpPredicate RHSPred, const Value *RHSOp0, const Value *RHSOp1, const DataLayout &DL, bool LHSIsTrue = true, @@ -1267,8 +1266,8 @@ std::optional<bool> isImpliedCondition(const Value *LHS, std::optional<bool> isImpliedByDomCondition(const Value *Cond, const Instruction *ContextI, const DataLayout &DL); -std::optional<bool> isImpliedByDomCondition(CmpInst::Predicate Pred, - const Value *LHS, const Value *RHS, +std::optional<bool> isImpliedByDomCondition(CmpPredicate Pred, const Value *LHS, + const Value *RHS, const Instruction *ContextI, const DataLayout &DL); diff --git a/llvm/include/llvm/CodeGen/LiveDebugVariables.h b/llvm/include/llvm/CodeGen/LiveDebugVariables.h index a4b5a87fd3887..2cb95f2c71ccd 100644 --- a/llvm/include/llvm/CodeGen/LiveDebugVariables.h +++ b/llvm/include/llvm/CodeGen/LiveDebugVariables.h @@ -21,7 +21,10 @@ #define LLVM_CODEGEN_LIVEDEBUGVARIABLES_H #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/raw_ostream.h" +#include <memory> namespace llvm { @@ -29,15 +32,15 @@ template <typename T> class ArrayRef; class LiveIntervals; class VirtRegMap; -class LiveDebugVariables : public MachineFunctionPass { - void *pImpl = nullptr; public: - static char ID; // Pass identification, replacement for typeid - + class LDVImpl; LiveDebugVariables(); - ~LiveDebugVariables() override; + ~LiveDebugVariables(); + LiveDebugVariables(LiveDebugVariables &&); + void analyze(MachineFunction &MF, LiveIntervals *LIS); /// splitRegister - Move any user variables in OldReg to the live ranges in /// NewRegs where they are live. Mark the values as unavailable where no new /// register is live. @@ -49,12 +52,39 @@ class LiveDebugVariables : public MachineFunctionPass { /// @param VRM Rename virtual registers according to map. void emitDebugValues(VirtRegMap *VRM); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// dump - Print data structures to dbgs(). 
void dump() const; +#endif + + void print(raw_ostream &OS) const; + + void releaseMemory(); + + bool invalidate(MachineFunction &MF, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &Inv); private: + std::unique_ptr<LDVImpl> PImpl; +}; + +class LiveDebugVariablesWrapperLegacy : public MachineFunctionPass { + std::unique_ptr<LiveDebugVariables> Impl; + +public: + static char ID; // Pass identification, replacement for typeid + + LiveDebugVariablesWrapperLegacy(); + bool runOnMachineFunction(MachineFunction &) override; - void releaseMemory() override; + + LiveDebugVariables &getLDV() { return *Impl; } + const LiveDebugVariables &getLDV() const { return *Impl; } + + void releaseMemory() override { + if (Impl) + Impl->releaseMemory(); + } void getAnalysisUsage(AnalysisUsage &) const override; MachineFunctionProperties getSetProperties() const override { @@ -63,6 +93,32 @@ } }; +class LiveDebugVariablesAnalysis + : public AnalysisInfoMixin<LiveDebugVariablesAnalysis> { + friend AnalysisInfoMixin<LiveDebugVariablesAnalysis>; + static AnalysisKey Key; + +public: + using Result = LiveDebugVariables; + + MachineFunctionProperties getSetProperties() const { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::TracksDebugUserValues); + } + + Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); +}; + +class LiveDebugVariablesPrinterPass + : public PassInfoMixin<LiveDebugVariablesPrinterPass> { + raw_ostream &OS; + +public: + LiveDebugVariablesPrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; } // end namespace llvm #endif // LLVM_CODEGEN_LIVEDEBUGVARIABLES_H diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 6a41094ff933b..a207f3886bd0e 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2292,7 +2292,7 @@ class TargetLoweringBase { virtual void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const {} /// Returns true if arguments should be sign-extended in lib calls. - virtual bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const { + virtual bool shouldSignExtendTypeInLibCall(Type *Ty, bool IsSigned) const { return IsSigned; } @@ -4713,18 +4713,18 @@ class TargetLowering : public TargetLoweringBase { // shouldExtendTypeInLibCall can get the original type before soften. 
ArrayRef<EVT> OpsVTBeforeSoften; EVT RetVTBeforeSoften; - bool IsSExt : 1; + bool IsSigned : 1; bool DoesNotReturn : 1; bool IsReturnValueUsed : 1; bool IsPostTypeLegalization : 1; bool IsSoften : 1; MakeLibCallOptions() - : IsSExt(false), DoesNotReturn(false), IsReturnValueUsed(true), + : IsSigned(false), DoesNotReturn(false), IsReturnValueUsed(true), IsPostTypeLegalization(false), IsSoften(false) {} - MakeLibCallOptions &setSExt(bool Value = true) { - IsSExt = Value; + MakeLibCallOptions &setIsSigned(bool Value = true) { + IsSigned = Value; return *this; } diff --git a/llvm/include/llvm/Config/abi-breaking.h.cmake b/llvm/include/llvm/Config/abi-breaking.h.cmake index 81495f0569752..2d27e02b1d545 100644 --- a/llvm/include/llvm/Config/abi-breaking.h.cmake +++ b/llvm/include/llvm/Config/abi-breaking.h.cmake @@ -12,8 +12,6 @@ #ifndef LLVM_ABI_BREAKING_CHECKS_H #define LLVM_ABI_BREAKING_CHECKS_H -#include "llvm/Support/Compiler.h" - /* Define to enable checks that alter the LLVM C++ ABI */ #cmakedefine01 LLVM_ENABLE_ABI_BREAKING_CHECKS @@ -45,12 +43,12 @@ #endif namespace llvm { #if LLVM_ENABLE_ABI_BREAKING_CHECKS -LLVM_ABI extern int EnableABIBreakingChecks; +extern int EnableABIBreakingChecks; LLVM_HIDDEN_VISIBILITY __attribute__((weak)) int *VerifyEnableABIBreakingChecks = &EnableABIBreakingChecks; #else -LLVM_ABI extern int DisableABIBreakingChecks; +extern int DisableABIBreakingChecks; LLVM_HIDDEN_VISIBILITY __attribute__((weak)) int *VerifyDisableABIBreakingChecks = &DisableABIBreakingChecks; diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 9844214c537a0..2831ebb3be798 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -393,10 +393,13 @@ const char *getLinkageName(Linkage L); /// Defines the scope in which this symbol should be visible: /// Default -- Visible in the public interface of the linkage unit. /// Hidden -- Visible within the linkage unit, but not exported from it. +/// SideEffectsOnly -- Like hidden, but symbol can only be looked up once +/// to trigger materialization of the containing graph. /// Local -- Visible only within the LinkGraph. enum class Scope : uint8_t { Default, Hidden, + SideEffectsOnly, Local }; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h index 035139578e08f..fb66bf812f610 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h @@ -29,12 +29,6 @@ namespace orc { // --raw_ostream operators for ORC types-- -/// Render a SymbolStringPtr. -raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym); - -/// Render a NonOwningSymbolStringPtr. -raw_ostream &operator<<(raw_ostream &OS, NonOwningSymbolStringPtr Sym); - /// Render a SymbolNameSet. 
raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h b/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h index f47956a65f2e7..85f08b53f74a3 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h @@ -92,6 +92,9 @@ class SymbolStringPtrBase { return LHS.S < RHS.S; } + friend raw_ostream &operator<<(raw_ostream &OS, + const SymbolStringPtrBase &Sym); + #ifndef NDEBUG // Returns true if the pool entry's ref count is above zero (or if the entry // is an empty or tombstone value). Useful for debugging and testing -- this diff --git a/llvm/include/llvm/IR/CmpPredicate.h b/llvm/include/llvm/IR/CmpPredicate.h new file mode 100644 index 0000000000000..4b1be7beb2b66 --- /dev/null +++ b/llvm/include/llvm/IR/CmpPredicate.h @@ -0,0 +1,62 @@ +//===- CmpPredicate.h - CmpInst Predicate with samesign information -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A CmpInst::Predicate with any samesign information (applicable to ICmpInst). +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_CMPPREDICATE_H +#define LLVM_IR_CMPPREDICATE_H + +#include "llvm/IR/InstrTypes.h" + +namespace llvm { +/// An abstraction over a floating-point predicate, and a pack of an integer +/// predicate with samesign information. Some functions in ICmpInst construct +/// and return this type in place of a Predicate. +class CmpPredicate { + CmpInst::Predicate Pred; + bool HasSameSign; + +public: + /// Constructed implicitly with either a Predicate and samesign information, or + /// just a Predicate, dropping samesign information. + CmpPredicate(CmpInst::Predicate Pred, bool HasSameSign = false) + : Pred(Pred), HasSameSign(HasSameSign) { + assert(!HasSameSign || CmpInst::isIntPredicate(Pred)); + } + + /// Implicitly converts to the underlying Predicate, dropping samesign + /// information. + operator CmpInst::Predicate() const { return Pred; } + + /// Query samesign information, for optimizations. + bool hasSameSign() const { return HasSameSign; } + + /// Compares two CmpPredicates taking samesign into account and returns the + /// canonicalized CmpPredicate if they match. An alternative to operator==. + /// + /// For example, + /// samesign ult + samesign ult -> samesign ult + /// samesign ult + ult -> ult + /// samesign ult + slt -> slt + /// ult + ult -> ult + /// ult + slt -> std::nullopt + static std::optional<CmpPredicate> getMatching(CmpPredicate A, + CmpPredicate B); + + /// An operator== on the underlying Predicate. + bool operator==(CmpInst::Predicate P) const { return Pred == P; } + + /// There is no operator== defined on CmpPredicate. Use getMatching instead to + /// get the canonicalized matching CmpPredicate. 
+ bool operator==(CmpPredicate) const = delete; +}; +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 605964af5d676..a42bf6bca1b9f 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -24,6 +24,7 @@ #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/CmpPredicate.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GEPNoWrapFlags.h" @@ -1203,6 +1204,33 @@ class ICmpInst: public CmpInst { #endif } + /// @returns the predicate along with samesign information. + CmpPredicate getCmpPredicate() const { + return {getPredicate(), hasSameSign()}; + } + + /// @returns the inverse predicate along with samesign information: static + /// variant. + static CmpPredicate getInverseCmpPredicate(CmpPredicate Pred) { + return {getInversePredicate(Pred), Pred.hasSameSign()}; + } + + /// @returns the inverse predicate along with samesign information. + CmpPredicate getInverseCmpPredicate() const { + return getInverseCmpPredicate(getCmpPredicate()); + } + + /// @returns the swapped predicate along with samesign information: static + /// variant. + static CmpPredicate getSwappedCmpPredicate(CmpPredicate Pred) { + return {getSwappedPredicate(Pred), Pred.hasSameSign()}; + } + + /// @returns the swapped predicate. + Predicate getSwappedCmpPredicate() const { + return getSwappedPredicate(getCmpPredicate()); + } + /// For example, EQ->EQ, SLE->SLE, UGT->SGT, etc. /// @returns the predicate that would be the result if the operand were /// regarded as signed. @@ -1212,7 +1240,7 @@ class ICmpInst: public CmpInst { } /// Return the signed version of the predicate: static variant. - static Predicate getSignedPredicate(Predicate pred); + static Predicate getSignedPredicate(Predicate Pred); /// For example, EQ->EQ, SLE->ULE, UGT->UGT, etc. /// @returns the predicate that would be the result if the operand were /// regarded as unsigned. @@ -1223,14 +1251,15 @@ class ICmpInst: public CmpInst { } /// Return the unsigned version of the predicate: static variant. - static Predicate getUnsignedPredicate(Predicate pred); + static Predicate getUnsignedPredicate(Predicate Pred); - /// For example, SLT->ULT, ULT->SLT, SLE->ULE, ULE->SLE, EQ->Failed assert + /// For example, SLT->ULT, ULT->SLT, SLE->ULE, ULE->SLE, EQ->EQ /// @returns the unsigned version of the signed predicate pred or /// the signed version of the unsigned predicate pred. - static Predicate getFlippedSignednessPredicate(Predicate pred); + /// Static variant. + static Predicate getFlippedSignednessPredicate(Predicate Pred); - /// For example, SLT->ULT, ULT->SLT, SLE->ULE, ULE->SLE, EQ->Failed assert + /// For example, SLT->ULT, ULT->SLT, SLE->ULE, ULE->SLE, EQ->EQ /// @returns the unsigned version of the signed predicate pred or /// the signed version of the unsigned predicate pred. Predicate getFlippedSignednessPredicate() const { diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index a4eb75ceb6930..2a05c2ac0758c 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1913,6 +1913,11 @@ class ModuleSummaryIndex { /// Checks if we can import global variable from another module. bool canImportGlobalVar(const GlobalValueSummary *S, bool AnalyzeRefs) const; + + /// Same as above but checks whether the global var is importable as a + /// declaration. 
+ bool canImportGlobalVar(const GlobalValueSummary *S, bool AnalyzeRefs, + bool &CanImportDecl) const; }; /// GraphTraits definition to build SCC for the index diff --git a/llvm/include/llvm/IR/StructuralHash.h b/llvm/include/llvm/IR/StructuralHash.h index 071575137ff57..514dd6f174b90 100644 --- a/llvm/include/llvm/IR/StructuralHash.h +++ b/llvm/include/llvm/IR/StructuralHash.h @@ -31,6 +31,9 @@ class Module; /// to true includes instruction and operand type information. stable_hash StructuralHash(const Function &F, bool DetailedHash = false); +/// Returns a hash of the global variable \p G. +stable_hash StructuralHash(const GlobalVariable &G); + /// Returns a hash of the module \p M by hashing all functions and global /// variables contained within. \param M The module to hash. \param DetailedHash /// Whether or not to encode additional information in the function hashes that diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 88bca2c75c949..7b81c9a8e143a 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -154,7 +154,7 @@ void initializeLegalizerPass(PassRegistry &); void initializeGISelCSEAnalysisWrapperPassPass(PassRegistry &); void initializeGISelKnownBitsAnalysisPass(PassRegistry &); void initializeLiveDebugValuesPass(PassRegistry &); -void initializeLiveDebugVariablesPass(PassRegistry &); +void initializeLiveDebugVariablesWrapperLegacyPass(PassRegistry &); void initializeLiveIntervalsWrapperPassPass(PassRegistry &); void initializeLiveRangeShrinkPass(PassRegistry &); void initializeLiveRegMatrixWrapperLegacyPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 437ec39beb040..e65bd58dae96b 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -28,7 +28,7 @@ MODULE_PASS("global-merge", GlobalMergePass(TM, GlobalMergeOptions())) MODULE_PASS("jmc-instrumenter", JMCInstrumenterPass()) MODULE_PASS("lower-emutls", LowerEmuTLSPass()) MODULE_PASS("pre-isel-intrinsic-lowering", PreISelIntrinsicLoweringPass()) -MODULE_PASS("print<regusage>", PhysicalRegisterUsageInfoPrinterPass(dbgs())) +MODULE_PASS("print<regusage>", PhysicalRegisterUsageInfoPrinterPass(errs())) MODULE_PASS("shadow-stack-gc-lowering", ShadowStackGCLoweringPass()) MODULE_PASS("global-merge-func", GlobalMergeFuncPass()) #undef MODULE_PASS @@ -98,6 +98,7 @@ LOOP_PASS("loop-term-fold", LoopTermFoldPass()) // computed. (We still either need to regenerate kill flags after regalloc, or // preferably fix the scavenger to not depend on them). 
MACHINE_FUNCTION_ANALYSIS("edge-bundles", EdgeBundlesAnalysis()) +MACHINE_FUNCTION_ANALYSIS("livedebugvars", LiveDebugVariablesAnalysis()) MACHINE_FUNCTION_ANALYSIS("live-intervals", LiveIntervalsAnalysis()) MACHINE_FUNCTION_ANALYSIS("live-reg-matrix", LiveRegMatrixAnalysis()) MACHINE_FUNCTION_ANALYSIS("live-vars", LiveVariablesAnalysis()) @@ -146,19 +147,20 @@ MACHINE_FUNCTION_PASS("opt-phis", OptimizePHIsPass()) MACHINE_FUNCTION_PASS("peephole-opt", PeepholeOptimizerPass()) MACHINE_FUNCTION_PASS("phi-node-elimination", PHIEliminationPass()) MACHINE_FUNCTION_PASS("print", PrintMIRPass()) -MACHINE_FUNCTION_PASS("print", LiveIntervalsPrinterPass(dbgs())) -MACHINE_FUNCTION_PASS("print", LiveVariablesPrinterPass(dbgs())) +MACHINE_FUNCTION_PASS("print", LiveDebugVariablesPrinterPass(errs())) +MACHINE_FUNCTION_PASS("print", LiveIntervalsPrinterPass(errs())) +MACHINE_FUNCTION_PASS("print", LiveVariablesPrinterPass(errs())) MACHINE_FUNCTION_PASS("print", - MachineBlockFrequencyPrinterPass(dbgs())) + MachineBlockFrequencyPrinterPass(errs())) MACHINE_FUNCTION_PASS("print", - MachineBranchProbabilityPrinterPass(dbgs())) + MachineBranchProbabilityPrinterPass(errs())) MACHINE_FUNCTION_PASS("print", - MachineDominatorTreePrinterPass(dbgs())) -MACHINE_FUNCTION_PASS("print", MachineLoopPrinterPass(dbgs())) + MachineDominatorTreePrinterPass(errs())) +MACHINE_FUNCTION_PASS("print", MachineLoopPrinterPass(errs())) MACHINE_FUNCTION_PASS("print", - MachinePostDominatorTreePrinterPass(dbgs())) -MACHINE_FUNCTION_PASS("print", SlotIndexesPrinterPass(dbgs())) -MACHINE_FUNCTION_PASS("print", VirtRegMapPrinterPass(dbgs())) + MachinePostDominatorTreePrinterPass(errs())) +MACHINE_FUNCTION_PASS("print", SlotIndexesPrinterPass(errs())) +MACHINE_FUNCTION_PASS("print", VirtRegMapPrinterPass(errs())) MACHINE_FUNCTION_PASS("reg-usage-collector", RegUsageInfoCollectorPass()) MACHINE_FUNCTION_PASS("reg-usage-propagation", RegUsageInfoPropagationPass()) MACHINE_FUNCTION_PASS("require-all-machine-function-properties", diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index c5f7800097807..7133c0c6a302c 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -344,7 +344,9 @@ enum class InstrProfKind { MemProf = 0x40, // A temporal profile. TemporalProfile = 0x80, - LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/TemporalProfile) + // A profile with loop entry basic blocks instrumentation. + LoopEntriesInstrumentation = 0x100, + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/LoopEntriesInstrumentation) }; const std::error_category &instrprof_category(); diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index c66b0465a0b54..39613da81ecb4 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -730,10 +730,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVMAP_VERSION 6 /* Profile version is always of type uint64_t. Reserve the upper 32 bits in the - * version for other variants of profile. We set the 8th most significant bit + * version for other variants of profile. We set the 8th most significant bit * (i.e. bit 56) to 1 to indicate if this is an IR-level instrumentation * generated profile, and 0 if this is a Clang FE generated profile. * 1 in bit 57 indicates there are context-sensitive records in the profile. 
+ * The 55th bit indicates whether to always instrument loop entry blocks.
+ * The 58th bit indicates whether to always instrument function entry blocks.
 * The 59th bit indicates whether to use debug info to correlate profiles.
 * The 60th bit indicates single byte coverage instrumentation.
 * The 61st bit indicates function entry instrumentation only.
@@ -742,6 +744,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 */
 #define VARIANT_MASKS_ALL 0xffffffff00000000ULL
 #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL)
+#define VARIANT_MASK_INSTR_LOOP_ENTRIES (0x1ULL << 55)
 #define VARIANT_MASK_IR_PROF (0x1ULL << 56)
 #define VARIANT_MASK_CSIR_PROF (0x1ULL << 57)
 #define VARIANT_MASK_INSTR_ENTRY (0x1ULL << 58)
diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h
index 1fad2343e2c96..330cf540c099b 100644
--- a/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -123,6 +123,9 @@ class InstrProfReader {
   virtual bool instrEntryBBEnabled() const = 0;
 
+  /// Return true if the profile instruments all loop entries.
+  virtual bool instrLoopEntriesEnabled() const = 0;
+
   /// Return true if the profile has single byte counters representing coverage.
   virtual bool hasSingleByteCoverage() const = 0;
 
@@ -274,6 +277,11 @@ class TextInstrProfReader : public InstrProfReader {
                              InstrProfKind::FunctionEntryInstrumentation);
   }
 
+  bool instrLoopEntriesEnabled() const override {
+    return static_cast<bool>(ProfileKind &
+                             InstrProfKind::LoopEntriesInstrumentation);
+  }
+
   bool hasSingleByteCoverage() const override {
     return static_cast<bool>(ProfileKind & InstrProfKind::SingleByteCoverage);
   }
 
@@ -398,6 +406,10 @@ class RawInstrProfReader : public InstrProfReader {
     return (Version & VARIANT_MASK_INSTR_ENTRY) != 0;
   }
 
+  bool instrLoopEntriesEnabled() const override {
+    return (Version & VARIANT_MASK_INSTR_LOOP_ENTRIES) != 0;
+  }
+
   bool hasSingleByteCoverage() const override {
     return (Version & VARIANT_MASK_BYTE_COVERAGE) != 0;
   }
 
@@ -564,6 +576,7 @@ struct InstrProfReaderIndexBase {
   virtual bool isIRLevelProfile() const = 0;
   virtual bool hasCSIRLevelProfile() const = 0;
   virtual bool instrEntryBBEnabled() const = 0;
+  virtual bool instrLoopEntriesEnabled() const = 0;
   virtual bool hasSingleByteCoverage() const = 0;
   virtual bool functionEntryOnly() const = 0;
   virtual bool hasMemoryProfile() const = 0;
@@ -628,6 +641,10 @@ class InstrProfReaderIndex : public InstrProfReaderIndexBase {
     return (FormatVersion & VARIANT_MASK_INSTR_ENTRY) != 0;
   }
 
+  bool instrLoopEntriesEnabled() const override {
+    return (FormatVersion & VARIANT_MASK_INSTR_LOOP_ENTRIES) != 0;
+  }
+
   bool hasSingleByteCoverage() const override {
     return (FormatVersion & VARIANT_MASK_BYTE_COVERAGE) != 0;
   }
 
@@ -753,6 +770,10 @@ class IndexedInstrProfReader : public InstrProfReader {
     return Index->instrEntryBBEnabled();
   }
 
+  bool instrLoopEntriesEnabled() const override {
+    return Index->instrLoopEntriesEnabled();
+  }
+
   bool hasSingleByteCoverage() const override {
     return Index->hasSingleByteCoverage();
   }
diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h
index fa30926c66258..fdb51c4ab4218 100644
--- a/llvm/include/llvm/ProfileData/InstrProfWriter.h
+++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h
@@ -190,7 +190,9 @@ class InstrProfWriter {
       return make_error<InstrProfError>(instrprof_error::unsupported_version);
     }
     if (testIncompatible(InstrProfKind::FunctionEntryOnly,
+ InstrProfKind::FunctionEntryInstrumentation) || + testIncompatible(InstrProfKind::FunctionEntryOnly, + InstrProfKind::LoopEntriesInstrumentation)) { return make_error( instrprof_error::unsupported_version, "cannot merge FunctionEntryOnly profiles and BB profiles together"); diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 47487c9342594..6ffead4f13aeb 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -416,7 +416,7 @@ struct IndexedMemProfRecord { // the last entry in the list with the same function GUID. llvm::SmallVector CallSiteIds; - void clear() { AllocSites.clear(); } + void clear() { *this = IndexedMemProfRecord(); } void merge(const IndexedMemProfRecord &Other) { // TODO: Filter out duplicates which may occur if multiple memprof diff --git a/llvm/include/llvm/Support/InstructionCost.h b/llvm/include/llvm/Support/InstructionCost.h index ada0b8962881d..b5af0e0401ef2 100644 --- a/llvm/include/llvm/Support/InstructionCost.h +++ b/llvm/include/llvm/Support/InstructionCost.h @@ -198,10 +198,8 @@ class InstructionCost { return Value < RHS.Value; } - // Implement in terms of operator< to ensure that the two comparisons stay in - // sync bool operator==(const InstructionCost &RHS) const { - return !(*this < RHS) && !(RHS < *this); + return State == RHS.State && Value == RHS.Value; } bool operator!=(const InstructionCost &RHS) const { return !(*this == RHS); } diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index b0c63fc7c7b80..dbdc007d9c6fe 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -428,7 +428,7 @@ def unary_undef_to_zero: GICombineRule< // replaced with undef. 
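// (With this patch, G_ANYEXT of undef is handled by the dedicated
// anyext_undef rule added further down, while G_ZEXT/G_SEXT of undef fold
// to the constant 0 instead of propagating undef.)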
def propagate_undef_any_op: GICombineRule<
  (defs root:$root),
-  (match (wip_match_opcode G_ADD, G_FPTOSI, G_FPTOUI, G_SUB, G_XOR, G_TRUNC, G_BITCAST, G_ANYEXT):$root,
+  (match (wip_match_opcode G_ADD, G_FPTOSI, G_FPTOUI, G_SUB, G_XOR, G_TRUNC, G_BITCAST):$root,
         [{ return Helper.matchAnyExplicitUseIsUndef(*${root}); }]),
  (apply [{ Helper.replaceInstWithUndef(*${root}); }])>;
@@ -1857,6 +1857,26 @@ class integer_of_opcode<Instruction castOpcode> : GICombineRule <
 def integer_of_truncate : integer_of_opcode<G_TRUNC>;
 
+def anyext_undef: GICombineRule<
+  (defs root:$root),
+  (match (G_IMPLICIT_DEF $undef),
+         (G_ANYEXT $root, $undef):$Aext),
+  (apply [{ Helper.replaceInstWithUndef(*${Aext}); }])>;
+
+def zext_undef: GICombineRule<
+  (defs root:$root),
+  (match (G_IMPLICIT_DEF $undef),
+         (G_ZEXT $root, $undef):$Zext,
+         [{ return Helper.isConstantLegalOrBeforeLegalizer(MRI.getType(${Zext}->getOperand(0).getReg())); }]),
+  (apply [{ Helper.replaceInstWithConstant(*${Zext}, 0); }])>;
+
+def sext_undef: GICombineRule<
+  (defs root:$root),
+  (match (G_IMPLICIT_DEF $undef),
+         (G_SEXT $root, $undef):$Sext,
+         [{ return Helper.isConstantLegalOrBeforeLegalizer(MRI.getType(${Sext}->getOperand(0).getReg())); }]),
+  (apply [{ Helper.replaceInstWithConstant(*${Sext}, 0); }])>;
+
 def cast_of_cast_combines: GICombineGroup<[
   truncate_of_zext,
   truncate_of_sext,
@@ -1882,7 +1902,10 @@ def cast_combines: GICombineGroup<[
   narrow_binop_and,
   narrow_binop_or,
   narrow_binop_xor,
-  integer_of_truncate
+  integer_of_truncate,
+  anyext_undef,
+  sext_undef,
+  zext_undef
 ]>;
 
 def canonicalize_icmp : GICombineRule<
diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
index 3075b7ebae59e..71592058e3456 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -157,7 +157,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner {
   /// conditional branch or select to create a compare with a canonical
   /// (inverted) predicate which is then more likely to be matched with other
   /// values.
-  static bool isCanonicalPredicate(CmpInst::Predicate Pred) {
+  static bool isCanonicalPredicate(CmpPredicate Pred) {
     switch (Pred) {
     case CmpInst::ICMP_NE:
     case CmpInst::ICMP_ULE:
@@ -185,10 +185,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner {
   }
 
-  std::optional<std::pair<CmpInst::Predicate, Constant *>> static
-  getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred,
-                                           Constant *C);
+  std::optional<std::pair<CmpPredicate, Constant *>> static
+  getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred, Constant *C);
 
   static bool shouldAvoidAbsorbingNotIntoSelect(const SelectInst &SI) {
diff --git a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
index 35b3d615e3844..f6bf045f7de2c 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/CFGMST.h
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/BranchProbability.h"
@@ -52,10 +53,14 @@ template <class Edge, class BBInfo> class CFGMST {
   BranchProbabilityInfo *const BPI;
   BlockFrequencyInfo *const BFI;
+  LoopInfo *const LI;
 
   // If function entry will be always instrumented.
   const bool InstrumentFuncEntry;
 
+  // If true, loop entries will always be instrumented.
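+  // Illustrative CFG (not from this patch): given
+  //   entry -> header, header -> body, body -> header, header -> exit
+  // the edge entry->header enters the loop from outside and is kept out of
+  // the MST (hence instrumented), while the back edge body->header starts
+  // inside the loop and keeps its normal weight.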
+  const bool InstrumentLoopEntries;
+
   // Find the root group of the G and compress the path from G to the root.
   BBInfo *findAndCompressGroup(BBInfo *G) {
     if (G->Group != G)
@@ -154,6 +159,16 @@ template <class Edge, class BBInfo> class CFGMST {
       }
       if (BPI != nullptr)
         Weight = BPI->getEdgeProbability(&BB, TargetBB).scale(scaleFactor);
+      // If InstrumentLoopEntries is on and the current edge leads to a loop
+      // (i.e., TargetBB is a loop head and BB is outside its loop), set
+      // Weight to be minimal, so that the edge won't be chosen for the MST
+      // and will be instrumented.
+      if (InstrumentLoopEntries && LI->isLoopHeader(TargetBB)) {
+        Loop *TargetLoop = LI->getLoopFor(TargetBB);
+        assert(TargetLoop);
+        if (!TargetLoop->contains(&BB))
+          Weight = 0;
+      }
       if (Weight == 0)
         Weight++;
       auto *E = &addEdge(&BB, TargetBB, Weight);
@@ -252,6 +267,19 @@ template <class Edge, class BBInfo> class CFGMST {
     }
   }
 
+  [[maybe_unused]] bool validateLoopEntryInstrumentation() {
+    if (!InstrumentLoopEntries)
+      return true;
+    for (auto &Ei : AllEdges) {
+      if (Ei->Removed)
+        continue;
+      if (Ei->DestBB && LI->isLoopHeader(Ei->DestBB) &&
+          !LI->getLoopFor(Ei->DestBB)->contains(Ei->SrcBB) && Ei->InMST)
+        return false;
+    }
+    return true;
+  }
+
 public:
   // Dump the Debug information about the instrumentation.
   void dumpEdges(raw_ostream &OS, const Twine &Message) const {
@@ -291,13 +319,20 @@ template <class Edge, class BBInfo> class CFGMST {
     return *AllEdges.back();
   }
 
-  CFGMST(Function &Func, bool InstrumentFuncEntry,
+  CFGMST(Function &Func, bool InstrumentFuncEntry, bool InstrumentLoopEntries,
          BranchProbabilityInfo *BPI = nullptr,
-         BlockFrequencyInfo *BFI = nullptr)
-      : F(Func), BPI(BPI), BFI(BFI), InstrumentFuncEntry(InstrumentFuncEntry) {
+         BlockFrequencyInfo *BFI = nullptr, LoopInfo *LI = nullptr)
+      : F(Func), BPI(BPI), BFI(BFI), LI(LI),
+        InstrumentFuncEntry(InstrumentFuncEntry),
+        InstrumentLoopEntries(InstrumentLoopEntries) {
+    assert(!(InstrumentLoopEntries && !LI) &&
+           "expected a LoopInfo when instrumenting loop entries");
     buildEdges();
     sortEdgesByWeight();
     computeMinimumSpanningTree();
+    assert(validateLoopEntryInstrumentation() &&
+           "Loop entries should not be in MST when "
+           "InstrumentLoopEntries is on");
     if (AllEdges.size() > 1 && InstrumentFuncEntry)
       std::iter_swap(std::move(AllEdges.begin()),
                      std::move(AllEdges.begin() + AllEdges.size() - 1));
diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index 023c9de28209c..496d2958fc2d0 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -68,7 +68,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
                        BasicBlock::iterator &BBI);
   bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
   bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI);
-  bool processMemMove(MemMoveInst *M);
+  bool processMemMove(MemMoveInst *M, BasicBlock::iterator &BBI);
   bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore,
                             Value *cpyDst, Value *cpySrc, TypeSize cpyLen,
                             Align cpyAlign, BatchAAResults &BAA,
@@ -87,6 +87,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
                              AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
                              TypeSize Size, BatchAAResults &BAA);
+  bool isMemMoveMemSetDependency(MemMoveInst *M);
   void eraseInstruction(Instruction *I);
   bool iterateOnFunction(Function &F);
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 381fb7bbdb517..648a22deaf6ba 100644
--- 
a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -947,8 +947,14 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call, // // Make sure the object has not escaped here, and then check that none of the // call arguments alias the object below. + // + // We model calls that can return twice (setjmp) as clobbering non-escaping + // objects, to model any accesses that may occur prior to the second return. + // As an exception, ignore allocas, as setjmp is not required to preserve + // non-volatile stores for them. if (!isa(Object) && Call != Object && - AAQI.CA->isNotCapturedBefore(Object, Call, /*OrAt*/ false)) { + AAQI.CA->isNotCapturedBefore(Object, Call, /*OrAt*/ false) && + (isa(Object) || !Call->hasFnAttr(Attribute::ReturnsTwice))) { // Optimistically assume that call doesn't touch Object and check this // assumption in the following loop. diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 23e11bdbeab4c..e1eb219cf977e 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -1063,7 +1063,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { SmallVector RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const { SmallVector ReductionOperations; - unsigned RedOp = getOpcode(Kind); + unsigned RedOp = getOpcode(); // Search down from the Phi to the LoopExitInstr, looking for instructions // with a single user of the correct type for the reduction. diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 1a5bbbc7dfceb..05e8f5761c13c 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -63,9 +63,9 @@ static Value *simplifyBinOp(unsigned, Value *, Value *, const SimplifyQuery &, unsigned); static Value *simplifyBinOp(unsigned, Value *, Value *, const FastMathFlags &, const SimplifyQuery &, unsigned); -static Value *simplifyCmpInst(unsigned, Value *, Value *, const SimplifyQuery &, - unsigned); -static Value *simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, +static Value *simplifyCmpInst(CmpPredicate, Value *, Value *, + const SimplifyQuery &, unsigned); +static Value *simplifyICmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse); static Value *simplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned); static Value *simplifyXorInst(Value *, Value *, const SimplifyQuery &, @@ -132,8 +132,7 @@ static Constant *getFalse(Type *Ty) { return ConstantInt::getFalse(Ty); } static Constant *getTrue(Type *Ty) { return ConstantInt::getTrue(Ty); } /// isSameCompare - Is V equivalent to the comparison "LHS Pred RHS"? -static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS, - Value *RHS) { +static bool isSameCompare(Value *V, CmpPredicate Pred, Value *LHS, Value *RHS) { CmpInst *Cmp = dyn_cast(V); if (!Cmp) return false; @@ -150,10 +149,9 @@ static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS, /// %cmp = icmp sle i32 %sel, %rhs /// Compose new comparison by substituting %sel with either %tv or %fv /// and see if it simplifies. 
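/// For example (illustrative values), substituting the true arm %tv = 5
/// with %rhs = 10 gives `icmp sle i32 5, 10`, which folds to `true` for
/// that branch of the select.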
-static Value *simplifyCmpSelCase(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, Value *Cond, - const SimplifyQuery &Q, unsigned MaxRecurse, - Constant *TrueOrFalse) { +static Value *simplifyCmpSelCase(CmpPredicate Pred, Value *LHS, Value *RHS, + Value *Cond, const SimplifyQuery &Q, + unsigned MaxRecurse, Constant *TrueOrFalse) { Value *SimplifiedCmp = simplifyCmpInst(Pred, LHS, RHS, Q, MaxRecurse); if (SimplifiedCmp == Cond) { // %cmp simplified to the select condition (%cond). @@ -167,18 +165,16 @@ static Value *simplifyCmpSelCase(CmpInst::Predicate Pred, Value *LHS, } /// Simplify comparison with true branch of select -static Value *simplifyCmpSelTrueCase(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, Value *Cond, - const SimplifyQuery &Q, +static Value *simplifyCmpSelTrueCase(CmpPredicate Pred, Value *LHS, Value *RHS, + Value *Cond, const SimplifyQuery &Q, unsigned MaxRecurse) { return simplifyCmpSelCase(Pred, LHS, RHS, Cond, Q, MaxRecurse, getTrue(Cond->getType())); } /// Simplify comparison with false branch of select -static Value *simplifyCmpSelFalseCase(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, Value *Cond, - const SimplifyQuery &Q, +static Value *simplifyCmpSelFalseCase(CmpPredicate Pred, Value *LHS, Value *RHS, + Value *Cond, const SimplifyQuery &Q, unsigned MaxRecurse) { return simplifyCmpSelCase(Pred, LHS, RHS, Cond, Q, MaxRecurse, getFalse(Cond->getType())); @@ -471,9 +467,8 @@ static Value *threadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, /// We can simplify %cmp1 to true, because both branches of select are /// less than 3. We compose new comparison by substituting %tmp with both /// branches of select and see if it can be simplified. -static Value *threadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const SimplifyQuery &Q, - unsigned MaxRecurse) { +static Value *threadCmpOverSelect(CmpPredicate Pred, Value *LHS, Value *RHS, + const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; @@ -564,7 +559,7 @@ static Value *threadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS, /// comparison by seeing whether comparing with all of the incoming phi values /// yields the same result every time. If so returns the common result, /// otherwise returns null. -static Value *threadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, +static Value *threadCmpOverPHI(CmpPredicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) @@ -1001,7 +996,7 @@ Value *llvm::simplifyMulInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, /// Given a predicate and two operands, return true if the comparison is true. /// This is a helper for div/rem simplification where we return some other value /// when we can prove a relationship between the operands. -static bool isICmpTrue(ICmpInst::Predicate Pred, Value *LHS, Value *RHS, +static bool isICmpTrue(CmpPredicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Value *V = simplifyICmpInst(Pred, LHS, RHS, Q, MaxRecurse); Constant *C = dyn_cast_or_null(V); @@ -2601,7 +2596,7 @@ static Type *getCompareTy(Value *Op) { /// Rummage around inside V looking for something equivalent to the comparison /// "LHS Pred RHS". Return such a value if found, otherwise return null. /// Helper function for analyzing max/min idioms. 
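/// For example (illustrative), if V is `select (icmp slt %a, %b), %a, %b`
/// and the query is "slt %a %b", the select's condition is the equivalent
/// compare that gets returned.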
-static Value *extractEquivalentCondition(Value *V, CmpInst::Predicate Pred, +static Value *extractEquivalentCondition(Value *V, CmpPredicate Pred, Value *LHS, Value *RHS) { SelectInst *SI = dyn_cast(V); if (!SI) @@ -2710,8 +2705,8 @@ static bool haveNonOverlappingStorage(const Value *V1, const Value *V2) { // If the C and C++ standards are ever made sufficiently restrictive in this // area, it may be possible to update LLVM's semantics accordingly and reinstate // this optimization. -static Constant *computePointerICmp(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const SimplifyQuery &Q) { +static Constant *computePointerICmp(CmpPredicate Pred, Value *LHS, Value *RHS, + const SimplifyQuery &Q) { assert(LHS->getType() == RHS->getType() && "Must have same types"); const DataLayout &DL = Q.DL; const TargetLibraryInfo *TLI = Q.TLI; @@ -2859,8 +2854,8 @@ static Constant *computePointerICmp(CmpInst::Predicate Pred, Value *LHS, } /// Fold an icmp when its operands have i1 scalar type. -static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const SimplifyQuery &Q) { +static Value *simplifyICmpOfBools(CmpPredicate Pred, Value *LHS, Value *RHS, + const SimplifyQuery &Q) { Type *ITy = getCompareTy(LHS); // The return type. Type *OpTy = LHS->getType(); // The operand type. if (!OpTy->isIntOrIntVectorTy(1)) @@ -2962,8 +2957,8 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, } /// Try hard to fold icmp with zero RHS because this is a common case. -static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const SimplifyQuery &Q) { +static Value *simplifyICmpWithZero(CmpPredicate Pred, Value *LHS, Value *RHS, + const SimplifyQuery &Q) { if (!match(RHS, m_Zero())) return nullptr; @@ -3022,7 +3017,7 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, return nullptr; } -static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, +static Value *simplifyICmpWithConstant(CmpPredicate Pred, Value *LHS, Value *RHS, const InstrInfoQuery &IIQ) { Type *ITy = getCompareTy(RHS); // The return type. @@ -3115,8 +3110,8 @@ static void getUnsignedMonotonicValues(SmallPtrSetImpl &Res, Value *V, } } -static Value *simplifyICmpUsingMonotonicValues(ICmpInst::Predicate Pred, - Value *LHS, Value *RHS) { +static Value *simplifyICmpUsingMonotonicValues(CmpPredicate Pred, Value *LHS, + Value *RHS) { if (Pred != ICmpInst::ICMP_UGE && Pred != ICmpInst::ICMP_ULT) return nullptr; @@ -3133,9 +3128,8 @@ static Value *simplifyICmpUsingMonotonicValues(ICmpInst::Predicate Pred, return nullptr; } -static Value *simplifyICmpWithBinOpOnLHS(CmpInst::Predicate Pred, - BinaryOperator *LBO, Value *RHS, - const SimplifyQuery &Q, +static Value *simplifyICmpWithBinOpOnLHS(CmpPredicate Pred, BinaryOperator *LBO, + Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *ITy = getCompareTy(RHS); // The return type. @@ -3254,8 +3248,8 @@ static Value *simplifyICmpWithBinOpOnLHS(CmpInst::Predicate Pred, // *) C1 < C2 && C1 >= 0, or // *) C2 < C1 && C1 <= 0. // -static bool trySimplifyICmpWithAdds(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const InstrInfoQuery &IIQ) { +static bool trySimplifyICmpWithAdds(CmpPredicate Pred, Value *LHS, Value *RHS, + const InstrInfoQuery &IIQ) { // TODO: only support icmp slt for now. 
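// Illustrative instance of the rule above: with nsw adds,
// `(X + 1) slt (X + 3)` satisfies C1 < C2 && C1 >= 0 and folds to true.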
if (Pred != CmpInst::ICMP_SLT || !IIQ.UseInstrInfo) return false; @@ -3279,8 +3273,8 @@ static bool trySimplifyICmpWithAdds(CmpInst::Predicate Pred, Value *LHS, /// TODO: A large part of this logic is duplicated in InstCombine's /// foldICmpBinOp(). We should be able to share that and avoid the code /// duplication. -static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const SimplifyQuery &Q, +static Value *simplifyICmpWithBinOp(CmpPredicate Pred, Value *LHS, Value *RHS, + const SimplifyQuery &Q, unsigned MaxRecurse) { BinaryOperator *LBO = dyn_cast(LHS); BinaryOperator *RBO = dyn_cast(RHS); @@ -3513,8 +3507,8 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, /// simplify integer comparisons where at least one operand of the compare /// matches an integer min/max idiom. -static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const SimplifyQuery &Q, +static Value *simplifyICmpWithMinMax(CmpPredicate Pred, Value *LHS, Value *RHS, + const SimplifyQuery &Q, unsigned MaxRecurse) { Type *ITy = getCompareTy(LHS); // The return type. Value *A, *B; @@ -3698,7 +3692,7 @@ static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, return nullptr; } -static Value *simplifyICmpWithDominatingAssume(CmpInst::Predicate Predicate, +static Value *simplifyICmpWithDominatingAssume(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q) { // Gracefully handle instructions that have not been inserted yet. @@ -3721,8 +3715,8 @@ static Value *simplifyICmpWithDominatingAssume(CmpInst::Predicate Predicate, return nullptr; } -static Value *simplifyICmpWithIntrinsicOnLHS(CmpInst::Predicate Pred, - Value *LHS, Value *RHS) { +static Value *simplifyICmpWithIntrinsicOnLHS(CmpPredicate Pred, Value *LHS, + Value *RHS) { auto *II = dyn_cast(LHS); if (!II) return nullptr; @@ -3770,9 +3764,8 @@ static std::optional getRange(Value *V, /// Given operands for an ICmpInst, see if we can fold the result. /// If not, this returns null. -static Value *simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, +static Value *simplifyICmpInst(CmpPredicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { - CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isIntPredicate(Pred) && "Not an integer compare!"); if (Constant *CLHS = dyn_cast(LHS)) { @@ -4085,17 +4078,16 @@ static Value *simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, return nullptr; } -Value *llvm::simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *llvm::simplifyICmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q) { return ::simplifyICmpInst(Predicate, LHS, RHS, Q, RecursionLimit); } /// Given operands for an FCmpInst, see if we can fold the result. /// If not, this returns null. 
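/// For instance, `fcmp false %x, %y` always folds to `false` and
/// `fcmp true %x, %y` to `true`, independent of the operand values.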
-static Value *simplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +static Value *simplifyFCmpInst(CmpPredicate Pred, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { - CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isFPPredicate(Pred) && "Not an FP compare!"); if (Constant *CLHS = dyn_cast(LHS)) { @@ -4320,7 +4312,7 @@ static Value *simplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, return nullptr; } -Value *llvm::simplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *llvm::simplifyFCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const SimplifyQuery &Q) { return ::simplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit); } @@ -4557,7 +4549,7 @@ static Value *simplifySelectBitTest(Value *TrueVal, Value *FalseVal, Value *X, } static Value *simplifyCmpSelOfMaxMin(Value *CmpLHS, Value *CmpRHS, - ICmpInst::Predicate Pred, Value *TVal, + CmpPredicate Pred, Value *TVal, Value *FVal) { // Canonicalize common cmp+sel operand as CmpLHS. if (CmpRHS == TVal || CmpRHS == FVal) { @@ -4631,8 +4623,8 @@ static Value *simplifyCmpSelOfMaxMin(Value *CmpLHS, Value *CmpRHS, /// An alternative way to test if a bit is set or not uses sgt/slt instead of /// eq/ne. static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *CmpRHS, - ICmpInst::Predicate Pred, - Value *TrueVal, Value *FalseVal) { + CmpPredicate Pred, Value *TrueVal, + Value *FalseVal) { if (auto Res = decomposeBitTestICmp(CmpLHS, CmpRHS, Pred)) return simplifySelectBitTest(TrueVal, FalseVal, Res->X, &Res->Mask, Res->Pred == ICmpInst::ICMP_EQ); @@ -6142,14 +6134,14 @@ Value *llvm::simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, } /// Given operands for a CmpInst, see if we can fold the result. 
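/// Integer predicates dispatch to simplifyICmpInst and floating-point ones
/// to simplifyFCmpInst below; e.g. `icmp eq %x, %x` folds to `true`.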
-static Value *simplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +static Value *simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { - if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate)) + if (CmpInst::isIntPredicate(Predicate)) return simplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse); return simplifyFCmpInst(Predicate, LHS, RHS, FastMathFlags(), Q, MaxRecurse); } -Value *llvm::simplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, +Value *llvm::simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q) { return ::simplifyCmpInst(Predicate, LHS, RHS, Q, RecursionLimit); } @@ -7187,7 +7179,7 @@ static Value *simplifyInstructionWithOperands(Instruction *I, case Instruction::Xor: return simplifyXorInst(NewOps[0], NewOps[1], Q, MaxRecurse); case Instruction::ICmp: - return simplifyICmpInst(cast(I)->getPredicate(), NewOps[0], + return simplifyICmpInst(cast(I)->getCmpPredicate(), NewOps[0], NewOps[1], Q, MaxRecurse); case Instruction::FCmp: return simplifyFCmpInst(cast(I)->getPredicate(), NewOps[0], diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index d81546d0c9fed..f2c6949e535d2 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9379,7 +9379,7 @@ static std::optional isImpliedCondICmps(const ICmpInst *LHS, (LPred == ICmpInst::ICMP_ULT || LPred == ICmpInst::ICMP_UGE) && (RPred == ICmpInst::ICMP_ULT || RPred == ICmpInst::ICMP_UGE) && match(L0, m_c_Add(m_Specific(L1), m_Specific(R1)))) - return LPred == RPred; + return CmpPredicate::getMatching(LPred, RPred).has_value(); if (LPred == RPred) return isImpliedCondOperands(LPred, L0, L1, R0, R1); @@ -9392,7 +9392,7 @@ static std::optional isImpliedCondICmps(const ICmpInst *LHS, /// expect the RHS to be an icmp and the LHS to be an 'and', 'or', or a 'select' /// instruction. static std::optional -isImpliedCondAndOr(const Instruction *LHS, CmpInst::Predicate RHSPred, +isImpliedCondAndOr(const Instruction *LHS, CmpPredicate RHSPred, const Value *RHSOp0, const Value *RHSOp1, const DataLayout &DL, bool LHSIsTrue, unsigned Depth) { // The LHS must be an 'or', 'and', or a 'select' instruction. @@ -9422,7 +9422,7 @@ isImpliedCondAndOr(const Instruction *LHS, CmpInst::Predicate RHSPred, } std::optional -llvm::isImpliedCondition(const Value *LHS, CmpInst::Predicate RHSPred, +llvm::isImpliedCondition(const Value *LHS, CmpPredicate RHSPred, const Value *RHSOp0, const Value *RHSOp1, const DataLayout &DL, bool LHSIsTrue, unsigned Depth) { // Bail out when we hit the limit. @@ -9476,7 +9476,7 @@ std::optional llvm::isImpliedCondition(const Value *LHS, const Value *RHS, if (const ICmpInst *RHSCmp = dyn_cast(RHS)) { if (auto Implied = isImpliedCondition( - LHS, RHSCmp->getPredicate(), RHSCmp->getOperand(0), + LHS, RHSCmp->getCmpPredicate(), RHSCmp->getOperand(0), RHSCmp->getOperand(1), DL, LHSIsTrue, Depth)) return InvertRHS ? 
!*Implied : *Implied; return std::nullopt; @@ -9553,7 +9553,7 @@ std::optional llvm::isImpliedByDomCondition(const Value *Cond, return std::nullopt; } -std::optional llvm::isImpliedByDomCondition(CmpInst::Predicate Pred, +std::optional llvm::isImpliedByDomCondition(CmpPredicate Pred, const Value *LHS, const Value *RHS, const Instruction *ContextI, diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 63f4e34074e06..0444cb9e1bce5 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -4900,7 +4900,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { NameVals.push_back(*ValueId); assert(ModuleIdMap.count(VS->modulePath())); NameVals.push_back(ModuleIdMap[VS->modulePath()]); - NameVals.push_back(getEncodedGVSummaryFlags(VS->flags())); + NameVals.push_back( + getEncodedGVSummaryFlags(VS->flags(), shouldImportValueAsDecl(VS))); NameVals.push_back(getEncodedGVarFlags(VS->varflags())); for (auto &RI : VS->refs()) { auto RefValueId = getValueId(RI.getGUID()); diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 408395fefc298..59428818c1ee7 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -59,7 +59,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeInterleavedAccessPass(Registry); initializeJMCInstrumenterPass(Registry); initializeLiveDebugValuesPass(Registry); - initializeLiveDebugVariablesPass(Registry); + initializeLiveDebugVariablesWrapperLegacyPass(Registry); initializeLiveIntervalsWrapperPassPass(Registry); initializeLiveRangeShrinkPass(Registry); initializeLiveStacksPass(Registry); diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp index 2ff346d3fd022..317d3401f000a 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -38,6 +38,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -74,24 +75,27 @@ EnableLDV("live-debug-variables", cl::init(true), STATISTIC(NumInsertedDebugValues, "Number of DBG_VALUEs inserted"); STATISTIC(NumInsertedDebugLabels, "Number of DBG_LABELs inserted"); -char LiveDebugVariables::ID = 0; +char LiveDebugVariablesWrapperLegacy::ID = 0; -INITIALIZE_PASS_BEGIN(LiveDebugVariables, DEBUG_TYPE, - "Debug Variable Analysis", false, false) +INITIALIZE_PASS_BEGIN(LiveDebugVariablesWrapperLegacy, DEBUG_TYPE, + "Debug Variable Analysis", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) -INITIALIZE_PASS_END(LiveDebugVariables, DEBUG_TYPE, - "Debug Variable Analysis", false, false) +INITIALIZE_PASS_END(LiveDebugVariablesWrapperLegacy, DEBUG_TYPE, + "Debug Variable Analysis", false, false) -void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const { +void LiveDebugVariablesWrapperLegacy::getAnalysisUsage( + AnalysisUsage &AU) const { AU.addRequired(); AU.addRequiredTransitive(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } -LiveDebugVariables::LiveDebugVariables() : MachineFunctionPass(ID) { - initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry()); +LiveDebugVariablesWrapperLegacy::LiveDebugVariablesWrapperLegacy() + : 
MachineFunctionPass(ID) { + initializeLiveDebugVariablesWrapperLegacyPass( + *PassRegistry::getPassRegistry()); } enum : unsigned { UndefLocNo = ~0U }; @@ -274,8 +278,6 @@ using BlockSkipInstsMap = namespace { -class LDVImpl; - /// A user value is a part of a debug info user variable. /// /// A DBG_VALUE instruction notes that (a sub-register of) a virtual register @@ -285,6 +287,8 @@ class LDVImpl; /// user values are related if they are held by the same virtual register. The /// equivalence class is the transitive closure of that relation. class UserValue { + using LDVImpl = LiveDebugVariables::LDVImpl; + const DILocalVariable *Variable; ///< The debug info variable we are part of. /// The part of the variable we describe. const std::optional Fragment; @@ -528,9 +532,17 @@ class UserLabel { void print(raw_ostream &, const TargetRegisterInfo *); }; +} // end anonymous namespace + +namespace llvm { + /// Implementation of the LiveDebugVariables pass. -class LDVImpl { - LiveDebugVariables &pass; + +LiveDebugVariables::LiveDebugVariables() = default; +LiveDebugVariables::~LiveDebugVariables() = default; +LiveDebugVariables::LiveDebugVariables(LiveDebugVariables &&) = default; + +class LiveDebugVariables::LDVImpl { LocMap::Allocator allocator; MachineFunction *MF = nullptr; LiveIntervals *LIS; @@ -634,7 +646,7 @@ class LDVImpl { void computeIntervals(); public: - LDVImpl(LiveDebugVariables *ps) : pass(*ps) {} + LDVImpl(LiveIntervals *LIS) : LIS(LIS) {} bool runOnMachineFunction(MachineFunction &mf, bool InstrRef); @@ -671,9 +683,8 @@ class LDVImpl { void print(raw_ostream&); }; -} // end anonymous namespace +} // namespace llvm -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) static void printDebugLoc(const DebugLoc &DL, raw_ostream &CommentOS, const LLVMContext &Ctx) { if (!DL) @@ -753,7 +764,7 @@ void UserLabel::print(raw_ostream &OS, const TargetRegisterInfo *TRI) { OS << '\n'; } -void LDVImpl::print(raw_ostream &OS) { +void LiveDebugVariables::LDVImpl::print(raw_ostream &OS) { OS << "********** DEBUG VARIABLES **********\n"; for (auto &userValue : userValues) userValue->print(OS, TRI); @@ -761,18 +772,16 @@ void LDVImpl::print(raw_ostream &OS) { for (auto &userLabel : userLabels) userLabel->print(OS, TRI); } -#endif -void UserValue::mapVirtRegs(LDVImpl *LDV) { +void UserValue::mapVirtRegs(LiveDebugVariables::LDVImpl *LDV) { for (const MachineOperand &MO : locations) if (MO.isReg() && MO.getReg().isVirtual()) LDV->mapVirtReg(MO.getReg(), this); } -UserValue * -LDVImpl::getUserValue(const DILocalVariable *Var, - std::optional Fragment, - const DebugLoc &DL) { +UserValue *LiveDebugVariables::LDVImpl::getUserValue( + const DILocalVariable *Var, + std::optional Fragment, const DebugLoc &DL) { // FIXME: Handle partially overlapping fragments. See // https://reviews.llvm.org/D70121#1849741. 
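  // (Illustrative: a DBG_VALUE covering bits [0, 32) and another covering
  // bits [16, 48) of the same variable overlap, yet they map to distinct
  // DebugVariable keys below and are tracked independently.)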
DebugVariable ID(Var, Fragment, DL->getInlinedAt()); @@ -785,19 +794,20 @@ LDVImpl::getUserValue(const DILocalVariable *Var, return UV; } -void LDVImpl::mapVirtReg(Register VirtReg, UserValue *EC) { +void LiveDebugVariables::LDVImpl::mapVirtReg(Register VirtReg, UserValue *EC) { assert(VirtReg.isVirtual() && "Only map VirtRegs"); UserValue *&Leader = virtRegToEqClass[VirtReg]; Leader = UserValue::merge(Leader, EC); } -UserValue *LDVImpl::lookupVirtReg(Register VirtReg) { +UserValue *LiveDebugVariables::LDVImpl::lookupVirtReg(Register VirtReg) { if (UserValue *UV = virtRegToEqClass.lookup(VirtReg)) return UV->getLeader(); return nullptr; } -bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) { +bool LiveDebugVariables::LDVImpl::handleDebugValue(MachineInstr &MI, + SlotIndex Idx) { // DBG_VALUE loc, offset, variable, expr // DBG_VALUE_LIST variable, expr, locs... if (!MI.isDebugValue()) { @@ -873,8 +883,8 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) { return true; } -MachineBasicBlock::iterator LDVImpl::handleDebugInstr(MachineInstr &MI, - SlotIndex Idx) { +MachineBasicBlock::iterator +LiveDebugVariables::LDVImpl::handleDebugInstr(MachineInstr &MI, SlotIndex Idx) { assert(MI.isDebugValueLike() || MI.isDebugPHI()); // In instruction referencing mode, there should be no DBG_VALUE instructions @@ -894,7 +904,8 @@ MachineBasicBlock::iterator LDVImpl::handleDebugInstr(MachineInstr &MI, return NextInst; } -bool LDVImpl::handleDebugLabel(MachineInstr &MI, SlotIndex Idx) { +bool LiveDebugVariables::LDVImpl::handleDebugLabel(MachineInstr &MI, + SlotIndex Idx) { // DBG_LABEL label if (MI.getNumOperands() != 1 || !MI.getOperand(0).isMetadata()) { LLVM_DEBUG(dbgs() << "Can't handle " << MI); @@ -917,7 +928,8 @@ bool LDVImpl::handleDebugLabel(MachineInstr &MI, SlotIndex Idx) { return true; } -bool LDVImpl::collectDebugValues(MachineFunction &mf, bool InstrRef) { +bool LiveDebugVariables::LDVImpl::collectDebugValues(MachineFunction &mf, + bool InstrRef) { bool Changed = false; for (MachineBasicBlock &MBB : mf) { for (MachineBasicBlock::iterator MBBI = MBB.begin(), MBBE = MBB.end(); @@ -1250,7 +1262,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI, I.setStopUnchecked(PrevEnd); } -void LDVImpl::computeIntervals() { +void LiveDebugVariables::LDVImpl::computeIntervals() { LexicalScopes LS; LS.initialize(*MF); @@ -1260,10 +1272,10 @@ void LDVImpl::computeIntervals() { } } -bool LDVImpl::runOnMachineFunction(MachineFunction &mf, bool InstrRef) { +bool LiveDebugVariables::LDVImpl::runOnMachineFunction(MachineFunction &mf, + bool InstrRef) { clear(); MF = &mf; - LIS = &pass.getAnalysis().getLIS(); TRI = mf.getSubtarget().getRegisterInfo(); LLVM_DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: " << mf.getName() << " **********\n"); @@ -1298,31 +1310,65 @@ static void removeDebugInstrs(MachineFunction &mf) { } } -bool LiveDebugVariables::runOnMachineFunction(MachineFunction &mf) { - if (!EnableLDV) - return false; - if (!mf.getFunction().getSubprogram()) { - removeDebugInstrs(mf); - return false; - } +bool LiveDebugVariablesWrapperLegacy::runOnMachineFunction( + MachineFunction &mf) { + auto *LIS = &getAnalysis().getLIS(); - // Have we been asked to track variable locations using instruction - // referencing? 
- bool InstrRef = mf.useDebugInstrRef(); + Impl = std::make_unique(); + Impl->analyze(mf, LIS); + return false; +} - if (!pImpl) - pImpl = new LDVImpl(this); - return static_cast(pImpl)->runOnMachineFunction(mf, InstrRef); +AnalysisKey LiveDebugVariablesAnalysis::Key; + +LiveDebugVariables +LiveDebugVariablesAnalysis::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + MFPropsModifier _(*this, MF); + + auto *LIS = &MFAM.getResult(MF); + LiveDebugVariables LDV; + LDV.analyze(MF, LIS); + return LDV; +} + +PreservedAnalyses +LiveDebugVariablesPrinterPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + auto &LDV = MFAM.getResult(MF); + LDV.print(OS); + return PreservedAnalyses::all(); } void LiveDebugVariables::releaseMemory() { - if (pImpl) - static_cast(pImpl)->clear(); + if (PImpl) + PImpl->clear(); } -LiveDebugVariables::~LiveDebugVariables() { - if (pImpl) - delete static_cast(pImpl); +bool LiveDebugVariables::invalidate( + MachineFunction &, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &) { + auto PAC = PA.getChecker(); + // Some architectures split the register allocation into multiple phases based + // on register classes. This requires preserving analyses between the phases + // by default. + return !PAC.preservedWhenStateless(); +} + +void LiveDebugVariables::analyze(MachineFunction &MF, LiveIntervals *LIS) { + if (!EnableLDV) + return; + if (!MF.getFunction().getSubprogram()) { + removeDebugInstrs(MF); + return; + } + + PImpl.reset(new LDVImpl(LIS)); + + // Have we been asked to track variable locations using instruction + // referencing? + bool InstrRef = MF.useDebugInstrRef(); + PImpl->runOnMachineFunction(MF, InstrRef); } //===----------------------------------------------------------------------===// @@ -1445,7 +1491,8 @@ UserValue::splitRegister(Register OldReg, ArrayRef NewRegs, return DidChange; } -void LDVImpl::splitPHIRegister(Register OldReg, ArrayRef NewRegs) { +void LiveDebugVariables::LDVImpl::splitPHIRegister(Register OldReg, + ArrayRef NewRegs) { auto RegIt = RegToPHIIdx.find(OldReg); if (RegIt == RegToPHIIdx.end()) return; @@ -1483,7 +1530,8 @@ void LDVImpl::splitPHIRegister(Register OldReg, ArrayRef NewRegs) { RegToPHIIdx[RegAndInstr.first].push_back(RegAndInstr.second); } -void LDVImpl::splitRegister(Register OldReg, ArrayRef NewRegs) { +void LiveDebugVariables::LDVImpl::splitRegister(Register OldReg, + ArrayRef NewRegs) { // Consider whether this split range affects any PHI locations. 
splitPHIRegister(OldReg, NewRegs); @@ -1504,8 +1552,8 @@ void LDVImpl::splitRegister(Register OldReg, ArrayRef NewRegs) { void LiveDebugVariables:: splitRegister(Register OldReg, ArrayRef NewRegs, LiveIntervals &LIS) { - if (pImpl) - static_cast(pImpl)->splitRegister(OldReg, NewRegs); + if (PImpl) + PImpl->splitRegister(OldReg, NewRegs); } void UserValue::rewriteLocations(VirtRegMap &VRM, const MachineFunction &MF, @@ -1807,7 +1855,7 @@ void UserLabel::emitDebugLabel(LiveIntervals &LIS, const TargetInstrInfo &TII, LLVM_DEBUG(dbgs() << '\n'); } -void LDVImpl::emitDebugValues(VirtRegMap *VRM) { +void LiveDebugVariables::LDVImpl::emitDebugValues(VirtRegMap *VRM) { LLVM_DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n"); if (!MF) return; @@ -1956,13 +2004,15 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) { } void LiveDebugVariables::emitDebugValues(VirtRegMap *VRM) { - if (pImpl) - static_cast(pImpl)->emitDebugValues(VRM); + if (PImpl) + PImpl->emitDebugValues(VRM); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void LiveDebugVariables::dump() const { - if (pImpl) - static_cast(pImpl)->print(dbgs()); -} +LLVM_DUMP_METHOD void LiveDebugVariables::dump() const { print(dbgs()); } #endif + +void LiveDebugVariables::print(raw_ostream &OS) const { + if (PImpl) + PImpl->print(OS); +} diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index d099544c2a491..03f015f8c9e32 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -559,14 +559,13 @@ class DevelopmentModeEvictionAdvisorAnalysis final float MLEvictAdvisor::getInitialQueueSize(const MachineFunction &MF) { auto &MRI = MF.getRegInfo(); - float Ret = 0.0; + unsigned NumUsedRegs = 0; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { Register Reg = Register::index2VirtReg(I); - if (MRI.reg_nodbg_empty(Reg)) - continue; - ++Ret; + if (!MRI.reg_nodbg_empty(Reg)) + ++NumUsedRegs; } - return Ret; + return static_cast(NumUsedRegs); } MLEvictAdvisor::MLEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA, diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index d9e5e9d9d1e41..3a9bdde28a2e7 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -909,7 +909,11 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << printJumpTableEntryReference(getIndex()); break; case MachineOperand::MO_GlobalAddress: - getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST); + if (auto *GV = getGlobal()) + GV->printAsOperand(OS, /*PrintType=*/false, MST); + else // Invalid, but may appear in debugging scenarios. 
+ OS << "globaladdress(null)"; + printOperandOffset(OS, getOffset()); break; case MachineOperand::MO_ExternalSymbol: { diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp index facda7a59e2f8..5ab589acee413 100644 --- a/llvm/lib/CodeGen/MachineStableHash.cpp +++ b/llvm/lib/CodeGen/MachineStableHash.cpp @@ -27,6 +27,8 @@ #include "llvm/CodeGen/Register.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/StructuralHash.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/ErrorHandling.h" @@ -93,13 +95,19 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { return 0; case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); - if (!GV->hasName()) { - ++StableHashBailingGlobalAddress; - return 0; + stable_hash GVHash = 0; + if (auto *GVar = dyn_cast(GV)) + GVHash = StructuralHash(*GVar); + if (!GVHash) { + if (!GV->hasName()) { + ++StableHashBailingGlobalAddress; + return 0; + } + GVHash = stable_hash_name(GV->getName()); } - auto Name = GV->getName(); - return stable_hash_combine(MO.getType(), MO.getTargetFlags(), - stable_hash_name(Name), MO.getOffset()); + + return stable_hash_combine(MO.getType(), MO.getTargetFlags(), GVHash, + MO.getOffset()); } case MachineOperand::MO_TargetIndex: { diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index 55d806e768b91..7ee24c960dbe0 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -130,7 +130,7 @@ char &llvm::RABasicID = RABasic::ID; INITIALIZE_PASS_BEGIN(RABasic, "regallocbasic", "Basic Register Allocator", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables) +INITIALIZE_PASS_DEPENDENCY(LiveDebugVariablesWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer) @@ -180,8 +180,8 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addPreserved(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index d0d2c585f0b54..8564fd8ca96da 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -151,7 +151,7 @@ char &llvm::RAGreedyID = RAGreedy::ID; INITIALIZE_PASS_BEGIN(RAGreedy, "greedy", "Greedy Register Allocator", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables) +INITIALIZE_PASS_DEPENDENCY(LiveDebugVariablesWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer) @@ -204,8 +204,8 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); @@ -2732,7 +2732,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { Loops = &getAnalysis().getLI(); Bundles = &getAnalysis().getEdgeBundles(); SpillPlacer = &getAnalysis().getResult(); - DebugVars = &getAnalysis(); + DebugVars = &getAnalysis().getLDV(); initializeCSRCost(); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h index 
9578b8d3bef87..594c481826cf0 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.h +++ b/llvm/lib/CodeGen/RegAllocGreedy.h @@ -24,6 +24,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/CalcSpillWeights.h" +#include "llvm/CodeGen/LiveDebugVariables.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineFunction.h" @@ -42,7 +43,7 @@ namespace llvm { class AllocationOrder; class AnalysisUsage; class EdgeBundles; -class LiveDebugVariables; +class LiveDebugVariablesWrapperLegacy; class LiveIntervals; class LiveRegMatrix; class MachineBasicBlock; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6c8e9969784c9..6435a2119077f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1220,8 +1220,11 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) - if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1})) + if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1})) { + NewFlags.setDisjoint(Flags.hasDisjoint() && + N0->getFlags().hasDisjoint()); return DAG.getNode(Opc, DL, VT, N00, OpNode, NewFlags); + } return SDValue(); } if (TLI.isReassocProfitable(DAG, N0, N1)) { diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index eede879e7e80d..d5551758c073e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1851,11 +1851,19 @@ bool FastISel::selectOperator(const User *I, unsigned Opcode) { return false; } - case Instruction::Unreachable: - if (TM.Options.TrapUnreachable) + case Instruction::Unreachable: { + if (TM.Options.TrapUnreachable) { + if (TM.Options.NoTrapAfterNoreturn) { + const auto *Call = + dyn_cast_or_null(cast(I)->getPrevNode()); + if (Call && Call->doesNotReturn()) + return true; + } + return fastEmit_(MVT::Other, MVT::Other, ISD::TRAP) != 0; - else - return true; + } + return true; + } case Instruction::Alloca: // FunctionLowering has the static-sized case covered. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 63536336e9622..ca87168929f96 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2104,7 +2104,7 @@ std::pair SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L InChain = TCChain; TargetLowering::CallLoweringInfo CLI(DAG); - bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, isSigned); + bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetTy, isSigned); CLI.setDebugLoc(SDLoc(Node)) .setChain(InChain) .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, @@ -2135,7 +2135,7 @@ std::pair SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; - Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, isSigned); + Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, isSigned); Entry.IsZExt = !Entry.IsSExt; Args.push_back(Entry); } @@ -4794,7 +4794,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(IsStrict ? 
1 : 0)); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(Signed); + CallOptions.setIsSigned(Signed); std::pair Tmp = TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, dl, Chain); Results.push_back(Tmp.first); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 3f8d117400efd..b52c2c07a7fba 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -1044,7 +1044,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) { SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NVT, N->getOperand(IsStrict ? 1 : 0)); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(Signed); + CallOptions.setIsSigned(Signed); CallOptions.setTypeListBeforeSoften(SVT, RVT, true); std::pair Tmp = TLI.makeLibCall(DAG, LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT), @@ -2099,7 +2099,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!"); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(true); + CallOptions.setIsSigned(true); std::pair Tmp = TLI.makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain); if (Strict) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 493abfde148c6..986d69e6c7a9e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2601,7 +2601,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ExpOp(SDNode *N) { N->getOperand(1 + OpOffset).getValueType().getSizeInBits() && "POWI exponent should match with sizeof(int) when doing the libcall."); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(true); + CallOptions.setIsSigned(true); SDValue Ops[2] = {N->getOperand(0 + OpOffset), N->getOperand(1 + OpOffset)}; std::pair Tmp = TLI.makeLibCall( DAG, LC, N->getValueType(0), Ops, CallOptions, SDLoc(N), Chain); @@ -4006,7 +4006,7 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_XINT(SDNode *N, SDValue &Lo, if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat) CallOptions.setTypeListBeforeSoften(OpVT, VT); else - CallOptions.setSExt(true); + CallOptions.setIsSigned(true); // FIXME: Is this needed? std::pair Tmp = TLI.makeLibCall(DAG, LC, VT, Op, CallOptions, dl, Chain); SplitInteger(Tmp.first, Lo, Hi); @@ -4098,7 +4098,7 @@ void DAGTypeLegalizer::ExpandIntRes_XROUND_XRINT(SDNode *N, SDValue &Lo, EVT RetVT = N->getValueType(0); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(true); + CallOptions.setIsSigned(true); std::pair Tmp = TLI.makeLibCall(DAG, LC, RetVT, Op, CallOptions, dl, Chain); @@ -4269,7 +4269,7 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, // upper half of the result if it exceeds VT. 
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(true); + CallOptions.setIsSigned(true); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi); } @@ -4640,7 +4640,7 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!"); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(true); + CallOptions.setIsSigned(true); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi); } @@ -4880,7 +4880,7 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, SDValue ShAmt = DAG.getZExtOrTrunc(N->getOperand(1), dl, ShAmtTy); SDValue Ops[2] = {N->getOperand(0), ShAmt}; TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(isSigned); + CallOptions.setIsSigned(isSigned); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi); return; } @@ -4970,7 +4970,7 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N, assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!"); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(true); + CallOptions.setIsSigned(true); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi); } @@ -5659,7 +5659,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_XINT_TO_FP(SDNode *N) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this XINT_TO_FP!"); TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(true); + CallOptions.setIsSigned(true); std::pair Tmp = TLI.makeLibCall(DAG, LC, DstVT, Op, CallOptions, SDLoc(N), Chain); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a38a3e9b91052..b72c5eff22f18 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1896,6 +1896,18 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { DAG.getConstant(0, getCurSDLoc(), MVT::nxv16i1)); } + if (VT.isRISCVVectorTuple()) { + assert(C->isNullValue() && "Can only zero this target type!"); + return NodeMap[V] = DAG.getNode( + ISD::BITCAST, getCurSDLoc(), VT, + DAG.getNode( + ISD::SPLAT_VECTOR, getCurSDLoc(), + EVT::getVectorVT(*DAG.getContext(), MVT::i8, + VT.getSizeInBits().getKnownMinValue() / 8, + true), + DAG.getConstant(0, getCurSDLoc(), MVT::getIntegerVT(8)))); + } + VectorType *VecTy = cast(V->getType()); // Now that we know the number and type of the elements, get that number of diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index bd4bcadb57d7a..eeba4b7d20f9c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -159,8 +159,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, SDValue NewOp = Ops[i]; Entry.Node = NewOp; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.IsSExt = shouldSignExtendTypeInLibCall(NewOp.getValueType(), - CallOptions.IsSExt); + Entry.IsSExt = + shouldSignExtendTypeInLibCall(Entry.Ty, CallOptions.IsSigned); Entry.IsZExt = !Entry.IsSExt; if (CallOptions.IsSoften && @@ -177,7 +177,7 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - bool signExtend = shouldSignExtendTypeInLibCall(RetVT, 
CallOptions.IsSExt); + bool signExtend = shouldSignExtendTypeInLibCall(RetTy, CallOptions.IsSigned); bool zeroExtend = !signExtend; if (CallOptions.IsSoften && @@ -3736,6 +3736,15 @@ bool TargetLowering::SimplifyDemandedVectorElts( KnownUndef.clearAllBits(); } break; + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef, + KnownZero, TLO, Depth + 1)) + return true; + // Don't fall through to generic undef -> undef handling. + return false; default: { if (Op.getOpcode() >= ISD::BUILTIN_OP_END) { if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef, @@ -10876,7 +10885,7 @@ void TargetLowering::forceExpandWideMUL(SelectionDAG &DAG, const SDLoc &dl, // Attempt a libcall. SDValue Ret; TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(Signed); + CallOptions.setIsSigned(Signed); CallOptions.setIsPostTypeLegalization(true); if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) { // Halves of WideVT are packed into registers in different order diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp index a1fa266354a52..cdc530621de62 100644 --- a/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -159,7 +159,7 @@ namespace { // may be invoked multiple times requiring it to save these analyses to be // used by RA later. AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp index 26a12512c87be..2084e68c16e29 100644 --- a/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/llvm/lib/CodeGen/VirtRegMap.cpp @@ -251,7 +251,7 @@ INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter", "Virtual Register Rewriter", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables) +INITIALIZE_PASS_DEPENDENCY(LiveDebugVariablesWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(LiveStacks) INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) @@ -264,14 +264,14 @@ void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); AU.addRequired(); if (!ClearVirtRegs) - AU.addPreserved(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -285,7 +285,7 @@ bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) { LIS = &getAnalysis().getLIS(); LRM = &getAnalysis().getLRM(); VRM = &getAnalysis().getVRM(); - DebugVars = &getAnalysis(); + DebugVars = &getAnalysis().getLDV(); LLVM_DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n" << "********** Function: " << MF->getName() << '\n'); LLVM_DEBUG(VRM->dump()); diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 9041dc3a52dcf..2da3750b4ed25 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -85,6 +85,8 @@ const char *getScopeName(Scope S) { return "default"; case Scope::Hidden: return "hidden"; + case Scope::SideEffectsOnly: + return "side-effects-only"; case Scope::Local: return "local"; } diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp 
b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp index 254c04b198612..e5dbb7ee0510a 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp @@ -253,7 +253,9 @@ void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) { } switch (Sym->getScope()) { case Scope::Local: - llvm_unreachable("External symbol should not have local linkage"); + case Scope::SideEffectsOnly: + llvm_unreachable("External symbol should not have local or " + "side-effects-only linkage"); case Scope::Hidden: break; case Scope::Default: diff --git a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp index 007e18e307399..a6c1d1ac632a0 100644 --- a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp @@ -862,9 +862,9 @@ Error COFFPlatform::COFFPlatformPlugin::preserveInitializerSections( // to the first block. if (!InitSym) { auto &B = **InitSection.blocks().begin(); - InitSym = &G.addDefinedSymbol(B, 0, *InitSymName, B.getSize(), - jitlink::Linkage::Strong, - jitlink::Scope::Default, false, true); + InitSym = &G.addDefinedSymbol( + B, 0, *InitSymName, B.getSize(), jitlink::Linkage::Strong, + jitlink::Scope::SideEffectsOnly, false, true); } // Add keep-alive edges to anonymous symbols in all other init blocks. diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index 222135bd77688..3547eabdd0ae7 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -14,6 +14,7 @@ #include "llvm/ExecutionEngine/Orc/Shared/OrcError.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MSVCErrorWorkarounds.h" +#include "llvm/Support/raw_ostream.h" #include #include @@ -938,7 +939,6 @@ Error JITDylib::resolve(MaterializationResponsibility &MR, auto &MI = MII->second; for (auto &Q : MI.takeQueriesMeeting(SymbolState::Resolved)) { Q->notifySymbolMetRequiredState(Name, ResolvedSym); - Q->removeQueryDependence(*this, Name); if (Q->isComplete()) CompletedQueries.insert(std::move(Q)); } @@ -1207,9 +1207,8 @@ void JITDylib::MaterializingInfo::removeQuery( PendingQueries, [&Q](const std::shared_ptr &V) { return V.get() == &Q; }); - assert(I != PendingQueries.end() && - "Query is not attached to this MaterializingInfo"); - PendingQueries.erase(I); + if (I != PendingQueries.end()) + PendingQueries.erase(I); } JITDylib::AsynchronousSymbolQueryList @@ -2615,6 +2614,12 @@ void ExecutionSession::OL_completeLookup( LLVM_DEBUG(dbgs() << "matched, symbol already in required state\n"); Q->notifySymbolMetRequiredState(Name, SymI->second.getSymbol()); + + // If this symbol is in anything other than the Ready state then + // we need to track the dependence. + if (SymI->second.getState() != SymbolState::Ready) + Q->addQueryDependence(JD, Name); + return true; } @@ -3165,7 +3170,6 @@ void ExecutionSession::IL_makeEDUEmitted( Q->notifySymbolMetRequiredState(SymbolStringPtr(Sym), Entry.getSymbol()); if (Q->isComplete()) Queries.insert(Q); - Q->removeQueryDependence(JD, SymbolStringPtr(Sym)); } } @@ -3308,7 +3312,7 @@ ExecutionSession::IL_emit(MaterializationResponsibility &MR, continue; } - // If we get here thene Dep is Emitted. We need to look up its defining + // If we get here then Dep is Emitted. We need to look up its defining // EDU and add this EDU to the defining EDU's list of users (this means // creating an EDUInfos entry if the defining EDU doesn't have one // already). 
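The Core.cpp hunks above change when a query stops tracking a symbol: resolve() and IL_makeEDUEmitted() no longer drop the dependence, and OL_completeLookup() now records one for any matched symbol that is not yet Ready. A hedged distillation of the new invariant, using the names from the patch:

    // A query may only stop depending on a symbol once that symbol reaches
    // the Ready state; Resolved or Emitted symbols keep the dependence alive
    // so failure in a later state can still be propagated to the query.
    if (SymI->second.getState() != SymbolState::Ready)
      Q->addQueryDependence(JD, Name);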
@@ -3317,8 +3321,6 @@ ExecutionSession::IL_emit(MaterializationResponsibility &MR, auto &DepMI = DepJD->MaterializingInfos[SymbolStringPtr(Dep)]; assert(DepMI.DefiningEDU && "Emitted symbol does not have a defining EDU"); - assert(!DepMI.DefiningEDU->Dependencies.empty() && - "Emitted symbol has empty dependencies (should be ready)"); assert(DepMI.DependantEDUs.empty() && "Already-emitted symbol has dependant EDUs?"); auto &DepEDUInfo = EDUInfos[DepMI.DefiningEDU.get()]; diff --git a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp index de8d003408871..6ced8c76b037c 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp @@ -137,14 +137,6 @@ struct PrintSymbolMapElemsMatchingCLOpts { namespace llvm { namespace orc { -raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym) { - return OS << *Sym; -} - -raw_ostream &operator<<(raw_ostream &OS, NonOwningSymbolStringPtr Sym) { - return OS << *Sym; -} - raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols) { return OS << printSequence(Symbols, '{', '}', PrintAll()); } diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index 431c64996b2c5..c3a217a802cb7 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -894,9 +894,9 @@ Error ELFNixPlatform::ELFNixPlatformPlugin::preserveInitSections( // to the first block. if (!InitSym) { auto &B = **InitSection.blocks().begin(); - InitSym = &G.addDefinedSymbol(B, 0, *InitSymName, B.getSize(), - jitlink::Linkage::Strong, - jitlink::Scope::Default, false, true); + InitSym = &G.addDefinedSymbol( + B, 0, *InitSymName, B.getSize(), jitlink::Linkage::Strong, + jitlink::Scope::SideEffectsOnly, false, true); } // Add keep-alive edges to anonymous symbols in all other init blocks. diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 3e02beb0baa86..1b18a4d0596c1 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -1001,9 +1001,9 @@ Error MachOPlatform::MachOPlatformPlugin::preserveImportantSections( // to the first block. if (!InitSym) { auto &B = **InitSection->blocks().begin(); - InitSym = &G.addDefinedSymbol(B, 0, *InitSymName, B.getSize(), - jitlink::Linkage::Strong, - jitlink::Scope::Default, false, true); + InitSym = &G.addDefinedSymbol( + B, 0, *InitSymName, B.getSize(), jitlink::Linkage::Strong, + jitlink::Scope::SideEffectsOnly, false, true); } // Add keep-alive edges to anonymous symbols in all other init blocks. 
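The COFFPlatform, ELFNixPlatform, and MachOPlatform hunks above all make the same change: the synthesized init-section symbol moves from Scope::Default to the new Scope::SideEffectsOnly, so it is materialized for its side effects without being exported from the graph. A hedged sketch of the shared pattern, using only calls that appear in the patch:

    // Anchor the init symbol to the first block of the init section and give
    // it side-effects-only scope: strong, not callable, kept alive.
    auto &B = **InitSection.blocks().begin();
    jitlink::Symbol &InitSym = G.addDefinedSymbol(
        B, 0, *InitSymName, B.getSize(), jitlink::Linkage::Strong,
        jitlink::Scope::SideEffectsOnly, /*IsCallable=*/false,
        /*IsLive=*/true);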
diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
index c1c55408c7858..c5342c4f4deb3 100644
--- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
@@ -65,6 +65,8 @@ JITSymbolFlags getJITSymbolFlagsForSymbol(Symbol &Sym) {
 
   if (Sym.getScope() == Scope::Default)
     Flags |= JITSymbolFlags::Exported;
+  else if (Sym.getScope() == Scope::SideEffectsOnly)
+    Flags |= JITSymbolFlags::MaterializationSideEffectsOnly;
 
   if (Sym.isCallable())
     Flags |= JITSymbolFlags::Callable;
@@ -236,7 +238,7 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext {
 
     SymbolMap InternedResult;
     for (auto *Sym : G.defined_symbols())
-      if (Sym->getScope() != Scope::Local) {
+      if (Sym->getScope() < Scope::SideEffectsOnly) {
         auto InternedName = ES.intern(Sym->getName());
         auto Ptr = getJITSymbolPtrForSymbol(*Sym, G.getTargetTriple());
         auto Flags = getJITSymbolFlagsForSymbol(*Sym);
@@ -249,7 +251,7 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext {
       }
 
     for (auto *Sym : G.absolute_symbols())
-      if (Sym->getScope() != Scope::Local) {
+      if (Sym->getScope() < Scope::SideEffectsOnly) {
         auto InternedName = ES.intern(Sym->getName());
         auto Ptr = getJITSymbolPtrForSymbol(*Sym, G.getTargetTriple());
         auto Flags = getJITSymbolFlagsForSymbol(*Sym);
@@ -281,11 +283,9 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext {
       // If this is a materialization-side-effects only symbol then bump
       // the counter and remove in from the result, otherwise make sure that
       // it's defined.
-      if (Flags.hasMaterializationSideEffectsOnly()) {
+      if (Flags.hasMaterializationSideEffectsOnly())
         ++NumMaterializationSideEffectsOnlySymbols;
-        InternedResult.erase(Sym);
-        continue;
-      } else if (I == InternedResult.end())
+      else if (I == InternedResult.end())
         MissingSymbols.push_back(Sym);
       else if (Layer.OverrideObjectFlags)
         I->second.setFlags(Flags);
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/Shared/CMakeLists.txt
index f4e4a6b4f53fc..792b0cc8251cc 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_component_library(LLVMOrcShared
   OrcError.cpp
   OrcRTBridge.cpp
   SimpleRemoteEPCUtils.cpp
+  SymbolStringPool.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/ExecutionEngine/Orc
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/SymbolStringPool.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/SymbolStringPool.cpp
new file mode 100644
index 0000000000000..9ca4e59288ecf
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/SymbolStringPool.cpp
@@ -0,0 +1,18 @@
+//===------- SymbolStringPool.cpp - SymbolStringPool implementation -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm::orc {
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtrBase &Sym) {
+  return OS << Sym.S->first();
+}
+
+} // namespace llvm::orc
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 065ce3a017283..4f07a4c4dd017 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -3842,9 +3842,8 @@ std::optional<bool> ICmpInst::compare(const KnownBits &LHS,
 }
 
 CmpInst::Predicate ICmpInst::getFlippedSignednessPredicate(Predicate pred) {
-  assert(CmpInst::isRelational(pred) &&
-         "Call only with non-equality predicates!");
-
+  if (CmpInst::isEquality(pred))
+    return pred;
   if (isSigned(pred))
     return getUnsignedPredicate(pred);
   if (isUnsigned(pred))
@@ -3916,6 +3915,23 @@ bool CmpInst::isImpliedFalseByMatchingCmp(Predicate Pred1, Predicate Pred2) {
   return isImpliedTrueByMatchingCmp(Pred1, getInversePredicate(Pred2));
 }
 
+//===----------------------------------------------------------------------===//
+// CmpPredicate Implementation
+//===----------------------------------------------------------------------===//
+
+std::optional<CmpPredicate> CmpPredicate::getMatching(CmpPredicate A,
+                                                      CmpPredicate B) {
+  if (A.Pred == B.Pred)
+    return A.HasSameSign == B.HasSameSign ? A : CmpPredicate(A.Pred);
+  if (A.HasSameSign &&
+      A.Pred == ICmpInst::getFlippedSignednessPredicate(B.Pred))
+    return B.Pred;
+  if (B.HasSameSign &&
+      B.Pred == ICmpInst::getFlippedSignednessPredicate(A.Pred))
+    return A.Pred;
+  return {};
+}
+
 //===----------------------------------------------------------------------===//
 // SwitchInst Implementation
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp
index 12a558b3bc1b1..d9024b0a8673f 100644
--- a/llvm/lib/IR/ModuleSummaryIndex.cpp
+++ b/llvm/lib/IR/ModuleSummaryIndex.cpp
@@ -328,6 +328,13 @@ void ModuleSummaryIndex::propagateAttributes(
 
 bool ModuleSummaryIndex::canImportGlobalVar(const GlobalValueSummary *S,
                                             bool AnalyzeRefs) const {
+  bool CanImportDecl;
+  return canImportGlobalVar(S, AnalyzeRefs, CanImportDecl);
+}
+
+bool ModuleSummaryIndex::canImportGlobalVar(const GlobalValueSummary *S,
+                                            bool AnalyzeRefs,
+                                            bool &CanImportDecl) const {
   auto HasRefsPreventingImport = [this](const GlobalVarSummary *GVS) {
     // We don't analyze GV references during attribute propagation, so
     // GV with non-trivial initializer can be marked either read or
@@ -348,13 +355,20 @@ bool ModuleSummaryIndex::canImportGlobalVar(const GlobalValueSummary *S,
   };
 
   auto *GVS = cast<GlobalVarSummary>(S->getBaseObject());
+  const bool nonInterposable =
+      !GlobalValue::isInterposableLinkage(S->linkage());
+  const bool eligibleToImport = !S->notEligibleToImport();
+
+  // It's correct to import a global variable only when it is not interposable
+  // and eligible to import.
+  CanImportDecl = (nonInterposable && eligibleToImport);
+
   // Global variable with non-trivial initializer can be imported
   // if it's readonly. This gives us extra opportunities for constant
   // folding and converting indirect calls to direct calls. We don't
   // analyze GV references during attribute propagation, because we
   // don't know yet if it is readonly or not.
- return !GlobalValue::isInterposableLinkage(S->linkage()) && - !S->notEligibleToImport() && + return nonInterposable && eligibleToImport && (!AnalyzeRefs || !HasRefsPreventingImport(GVS)); } diff --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp index ccc534a890419..1c617c100c7dc 100644 --- a/llvm/lib/IR/StructuralHash.cpp +++ b/llvm/lib/IR/StructuralHash.cpp @@ -46,7 +46,7 @@ class StructuralHashImpl { /// Assign a unique ID to each Value in the order they are first seen. DenseMap ValueToId; - stable_hash hashType(Type *ValueType) { + static stable_hash hashType(Type *ValueType) { SmallVector Hashes; Hashes.emplace_back(ValueType->getTypeID()); if (ValueType->isIntegerTy()) @@ -65,7 +65,7 @@ class StructuralHashImpl { } } - stable_hash hashAPInt(const APInt &I) { + static stable_hash hashAPInt(const APInt &I) { SmallVector Hashes; Hashes.emplace_back(I.getBitWidth()); auto RawVals = ArrayRef(I.getRawData(), I.getNumWords()); @@ -73,11 +73,39 @@ class StructuralHashImpl { return stable_hash_combine(Hashes); } - stable_hash hashAPFloat(const APFloat &F) { + static stable_hash hashAPFloat(const APFloat &F) { return hashAPInt(F.bitcastToAPInt()); } - stable_hash hashGlobalValue(const GlobalValue *GV) { + static stable_hash hashGlobalVariable(const GlobalVariable &GVar) { + if (!GVar.hasInitializer()) + return hashGlobalValue(&GVar); + + // Hash the contents of a string. + if (GVar.getName().starts_with(".str")) { + auto *C = GVar.getInitializer(); + if (const auto *Seq = dyn_cast(C)) + if (Seq->isString()) + return stable_hash_name(Seq->getAsString()); + } + + // Hash structural contents of Objective-C metadata in specific sections. + // This can be extended to other metadata if needed. + static constexpr const char *SectionNames[] = { + "__cfstring", "__cstring", "__objc_classrefs", + "__objc_methname", "__objc_selrefs", + }; + if (GVar.hasSection()) { + StringRef SectionName = GVar.getSection(); + for (const char *Name : SectionNames) + if (SectionName.contains(Name)) + return hashConstant(GVar.getInitializer()); + } + + return hashGlobalValue(&GVar); + } + + static stable_hash hashGlobalValue(const GlobalValue *GV) { if (!GV->hasName()) return 0; return stable_hash_name(GV->getName()); @@ -87,7 +115,7 @@ class StructuralHashImpl { // FunctionComparator::cmpConstants() in FunctionComparator.cpp, but here // we're interested in computing a hash rather than comparing two Constants. // Some of the logic is simplified, e.g, we don't expand GEPOperator. 
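The new hashGlobalVariable above switches string-like globals and selected Objective-C metadata sections to content-based hashing. A hedged, self-contained illustration of the core idea; the helper name is hypothetical and not from the patch:

    // Hypothetical helper: hash a string global by its bytes so identical
    // ".str" constants hash equally regardless of their symbol names.
    static stable_hash hashStringGlobal(const GlobalVariable &GVar) {
      if (GVar.hasInitializer())
        if (const auto *Seq =
                dyn_cast<ConstantDataSequential>(GVar.getInitializer()))
          if (Seq->isString())
            return stable_hash_name(Seq->getAsString());
      // Fall back to name-based hashing, as hashGlobalValue does.
      return stable_hash_name(GVar.getName());
    }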
- stable_hash hashConstant(Constant *C) { + static stable_hash hashConstant(const Constant *C) { SmallVector Hashes; Type *Ty = C->getType(); @@ -98,14 +126,21 @@ class StructuralHashImpl { return stable_hash_combine(Hashes); } + if (auto *GVar = dyn_cast(C)) { + Hashes.emplace_back(hashGlobalVariable(*GVar)); + return stable_hash_combine(Hashes); + } + if (auto *G = dyn_cast(C)) { Hashes.emplace_back(hashGlobalValue(G)); return stable_hash_combine(Hashes); } if (const auto *Seq = dyn_cast(C)) { - Hashes.emplace_back(xxh3_64bits(Seq->getRawDataValues())); - return stable_hash_combine(Hashes); + if (Seq->isString()) { + Hashes.emplace_back(stable_hash_name(Seq->getAsString())); + return stable_hash_combine(Hashes); + } } switch (C->getValueID()) { @@ -297,6 +332,10 @@ stable_hash llvm::StructuralHash(const Function &F, bool DetailedHash) { return H.getHash(); } +stable_hash llvm::StructuralHash(const GlobalVariable &GVar) { + return StructuralHashImpl::hashGlobalVariable(GVar); +} + stable_hash llvm::StructuralHash(const Module &M, bool DetailedHash) { StructuralHashImpl H(DetailedHash); H.update(M); diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index ac6b8b4c19700..ffa80faf6e249 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -990,7 +990,7 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) { Ty->getIntParameter(0); return TargetTypeInfo( ScalableVectorType::get(Type::getInt8Ty(C), TotalNumElts), - TargetExtType::CanBeLocal); + TargetExtType::CanBeLocal, TargetExtType::HasZeroInit); } // DirectX resources diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index ba52a37df9c25..cc9f59727c604 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -98,6 +98,7 @@ #include "llvm/CodeGen/InterleavedAccess.h" #include "llvm/CodeGen/InterleavedLoadCombine.h" #include "llvm/CodeGen/JMCInstrumenter.h" +#include "llvm/CodeGen/LiveDebugVariables.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveVariables.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 7c3798f6462a4..772ec5fd10e63 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -89,7 +89,7 @@ MODULE_PASS("insert-gcov-profiling", GCOVProfilerPass()) MODULE_PASS("instrorderfile", InstrOrderFilePass()) MODULE_PASS("instrprof", InstrProfilingLoweringPass()) MODULE_PASS("ctx-instr-lower", PGOCtxProfLoweringPass()) -MODULE_PASS("print", CtxProfAnalysisPrinterPass(dbgs())) +MODULE_PASS("print", CtxProfAnalysisPrinterPass(errs())) MODULE_PASS("invalidate", InvalidateAllAnalysesPass()) MODULE_PASS("iroutliner", IROutlinerPass()) MODULE_PASS("jmc-instrumenter", JMCInstrumenterPass()) @@ -116,21 +116,21 @@ MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen()) MODULE_PASS("pgo-instr-use", PGOInstrumentationUse()) MODULE_PASS("poison-checking", PoisonCheckingPass()) MODULE_PASS("pre-isel-intrinsic-lowering", PreISelIntrinsicLoweringPass(TM)) -MODULE_PASS("print", PrintModulePass(dbgs())) -MODULE_PASS("print-callgraph", CallGraphPrinterPass(dbgs())) -MODULE_PASS("print-callgraph-sccs", CallGraphSCCsPrinterPass(dbgs())) -MODULE_PASS("print-ir-similarity", IRSimilarityAnalysisPrinterPass(dbgs())) -MODULE_PASS("print-lcg", LazyCallGraphPrinterPass(dbgs())) -MODULE_PASS("print-lcg-dot", LazyCallGraphDOTPrinterPass(dbgs())) +MODULE_PASS("print", PrintModulePass(errs())) +MODULE_PASS("print-callgraph", CallGraphPrinterPass(errs())) 
+MODULE_PASS("print-callgraph-sccs", CallGraphSCCsPrinterPass(errs())) +MODULE_PASS("print-ir-similarity", IRSimilarityAnalysisPrinterPass(errs())) +MODULE_PASS("print-lcg", LazyCallGraphPrinterPass(errs())) +MODULE_PASS("print-lcg-dot", LazyCallGraphDOTPrinterPass(errs())) MODULE_PASS("print-must-be-executed-contexts", - MustBeExecutedContextPrinterPass(dbgs())) -MODULE_PASS("print-profile-summary", ProfileSummaryPrinterPass(dbgs())) -MODULE_PASS("print-stack-safety", StackSafetyGlobalPrinterPass(dbgs())) -MODULE_PASS("print", DXILMetadataAnalysisPrinterPass(dbgs())) -MODULE_PASS("print", DXILResourcePrinterPass(dbgs())) -MODULE_PASS("print", InlineAdvisorAnalysisPrinterPass(dbgs())) -MODULE_PASS("print", ModuleDebugInfoPrinterPass(dbgs())) -MODULE_PASS("print", PhysicalRegisterUsageInfoPrinterPass(dbgs())) + MustBeExecutedContextPrinterPass(errs())) +MODULE_PASS("print-profile-summary", ProfileSummaryPrinterPass(errs())) +MODULE_PASS("print-stack-safety", StackSafetyGlobalPrinterPass(errs())) +MODULE_PASS("print", DXILMetadataAnalysisPrinterPass(errs())) +MODULE_PASS("print", DXILResourcePrinterPass(errs())) +MODULE_PASS("print", InlineAdvisorAnalysisPrinterPass(errs())) +MODULE_PASS("print", ModuleDebugInfoPrinterPass(errs())) +MODULE_PASS("print", PhysicalRegisterUsageInfoPrinterPass(errs())) MODULE_PASS("pseudo-probe", SampleProfileProbePass(TM)) MODULE_PASS("pseudo-probe-update", PseudoProbeUpdatePass()) MODULE_PASS("recompute-globalsaa", RecomputeGlobalsAAPass()) @@ -225,7 +225,7 @@ MODULE_PASS_WITH_PARAMS( MODULE_PASS_WITH_PARAMS( "print", "StructuralHashPrinterPass", [](StructuralHashOptions Options) { - return StructuralHashPrinterPass(dbgs(), Options); + return StructuralHashPrinterPass(errs(), Options); }, parseStructuralHashPrinterPassOptions, "detailed;call-target-ignored") @@ -424,38 +424,38 @@ FUNCTION_PASS("pa-eval", PAEvalPass()) FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass()) FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt()) FUNCTION_PASS("place-safepoints", PlaceSafepointsPass()) -FUNCTION_PASS("print", PrintFunctionPass(dbgs())) +FUNCTION_PASS("print", PrintFunctionPass(errs())) // TODO: rename to print after NPM switch -FUNCTION_PASS("print-alias-sets", AliasSetsPrinterPass(dbgs())) -FUNCTION_PASS("print-cfg-sccs", CFGSCCPrinterPass(dbgs())) -FUNCTION_PASS("print-memderefs", MemDerefPrinterPass(dbgs())) -FUNCTION_PASS("print-mustexecute", MustExecutePrinterPass(dbgs())) -FUNCTION_PASS("print-predicateinfo", PredicateInfoPrinterPass(dbgs())) -FUNCTION_PASS("print", LoopAccessInfoPrinterPass(dbgs())) -FUNCTION_PASS("print", AssumptionPrinterPass(dbgs())) -FUNCTION_PASS("print", BlockFrequencyPrinterPass(dbgs())) -FUNCTION_PASS("print", BranchProbabilityPrinterPass(dbgs())) -FUNCTION_PASS("print", CostModelPrinterPass(dbgs())) -FUNCTION_PASS("print", CycleInfoPrinterPass(dbgs())) -FUNCTION_PASS("print", DependenceAnalysisPrinterPass(dbgs())) -FUNCTION_PASS("print", DebugAssignmentTrackingPrinterPass(dbgs())) -FUNCTION_PASS("print", DelinearizationPrinterPass(dbgs())) -FUNCTION_PASS("print", DemandedBitsPrinterPass(dbgs())) -FUNCTION_PASS("print", DominanceFrontierPrinterPass(dbgs())) -FUNCTION_PASS("print", DominatorTreePrinterPass(dbgs())) -FUNCTION_PASS("print", FunctionPropertiesPrinterPass(dbgs())) -FUNCTION_PASS("print", InlineCostAnnotationPrinterPass(dbgs())) +FUNCTION_PASS("print-alias-sets", AliasSetsPrinterPass(errs())) +FUNCTION_PASS("print-cfg-sccs", CFGSCCPrinterPass(errs())) +FUNCTION_PASS("print-memderefs", 
MemDerefPrinterPass(errs())) +FUNCTION_PASS("print-mustexecute", MustExecutePrinterPass(errs())) +FUNCTION_PASS("print-predicateinfo", PredicateInfoPrinterPass(errs())) +FUNCTION_PASS("print", LoopAccessInfoPrinterPass(errs())) +FUNCTION_PASS("print", AssumptionPrinterPass(errs())) +FUNCTION_PASS("print", BlockFrequencyPrinterPass(errs())) +FUNCTION_PASS("print", BranchProbabilityPrinterPass(errs())) +FUNCTION_PASS("print", CostModelPrinterPass(errs())) +FUNCTION_PASS("print", CycleInfoPrinterPass(errs())) +FUNCTION_PASS("print", DependenceAnalysisPrinterPass(errs())) +FUNCTION_PASS("print", DebugAssignmentTrackingPrinterPass(errs())) +FUNCTION_PASS("print", DelinearizationPrinterPass(errs())) +FUNCTION_PASS("print", DemandedBitsPrinterPass(errs())) +FUNCTION_PASS("print", DominanceFrontierPrinterPass(errs())) +FUNCTION_PASS("print", DominatorTreePrinterPass(errs())) +FUNCTION_PASS("print", FunctionPropertiesPrinterPass(errs())) +FUNCTION_PASS("print", InlineCostAnnotationPrinterPass(errs())) FUNCTION_PASS("print", - InlineSizeEstimatorAnalysisPrinterPass(dbgs())) -FUNCTION_PASS("print", LazyValueInfoPrinterPass(dbgs())) -FUNCTION_PASS("print", LoopPrinterPass(dbgs())) -FUNCTION_PASS("print", MemorySSAWalkerPrinterPass(dbgs())) -FUNCTION_PASS("print", PhiValuesPrinterPass(dbgs())) -FUNCTION_PASS("print", PostDominatorTreePrinterPass(dbgs())) -FUNCTION_PASS("print", RegionInfoPrinterPass(dbgs())) -FUNCTION_PASS("print", ScalarEvolutionPrinterPass(dbgs())) -FUNCTION_PASS("print", StackSafetyPrinterPass(dbgs())) -FUNCTION_PASS("print", UniformityInfoPrinterPass(dbgs())) + InlineSizeEstimatorAnalysisPrinterPass(errs())) +FUNCTION_PASS("print", LazyValueInfoPrinterPass(errs())) +FUNCTION_PASS("print", LoopPrinterPass(errs())) +FUNCTION_PASS("print", MemorySSAWalkerPrinterPass(errs())) +FUNCTION_PASS("print", PhiValuesPrinterPass(errs())) +FUNCTION_PASS("print", PostDominatorTreePrinterPass(errs())) +FUNCTION_PASS("print", RegionInfoPrinterPass(errs())) +FUNCTION_PASS("print", ScalarEvolutionPrinterPass(errs())) +FUNCTION_PASS("print", StackSafetyPrinterPass(errs())) +FUNCTION_PASS("print", UniformityInfoPrinterPass(errs())) FUNCTION_PASS("reassociate", ReassociatePass()) FUNCTION_PASS("redundant-dbg-inst-elim", RedundantDbgInstEliminationPass()) FUNCTION_PASS("reg2mem", RegToMemPass()) @@ -562,19 +562,19 @@ FUNCTION_PASS_WITH_PARAMS( FUNCTION_PASS_WITH_PARAMS( "print", "DependenceAnalysisPrinterPass", [](bool NormalizeResults) { - return DependenceAnalysisPrinterPass(dbgs(), NormalizeResults); + return DependenceAnalysisPrinterPass(errs(), NormalizeResults); }, parseDependenceAnalysisPrinterOptions, "normalized-results") FUNCTION_PASS_WITH_PARAMS( "print", "MemorySSAPrinterPass", [](bool NoEnsureOptimizedUses) { - return MemorySSAPrinterPass(dbgs(), !NoEnsureOptimizedUses); + return MemorySSAPrinterPass(errs(), !NoEnsureOptimizedUses); }, parseMemorySSAPrinterPassOptions, "no-ensure-optimized-uses") FUNCTION_PASS_WITH_PARAMS( "print", "StackLifetimePrinterPass", [](StackLifetime::LivenessType Type) { - return StackLifetimePrinterPass(dbgs(), Type); + return StackLifetimePrinterPass(errs(), Type); }, parseStackLifetimeOptions, "may;must") FUNCTION_PASS_WITH_PARAMS( @@ -664,11 +664,11 @@ LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass()) LOOP_PASS("loop-unroll-full", LoopFullUnrollPass()) LOOP_PASS("loop-versioning-licm", LoopVersioningLICMPass()) LOOP_PASS("no-op-loop", NoOpLoopPass()) -LOOP_PASS("print", PrintLoopPass(dbgs())) -LOOP_PASS("print", DDGAnalysisPrinterPass(dbgs())) 
-LOOP_PASS("print", IVUsersPrinterPass(dbgs())) -LOOP_PASS("print", LoopCachePrinterPass(dbgs())) -LOOP_PASS("print", LoopNestPrinterPass(dbgs())) +LOOP_PASS("print", PrintLoopPass(errs())) +LOOP_PASS("print", DDGAnalysisPrinterPass(errs())) +LOOP_PASS("print", IVUsersPrinterPass(errs())) +LOOP_PASS("print", LoopCachePrinterPass(errs())) +LOOP_PASS("print", LoopNestPrinterPass(errs())) #undef LOOP_PASS #ifndef LOOP_PASS_WITH_PARAMS diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 7663852236594..dad79b2c1761e 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -52,6 +52,9 @@ static InstrProfKind getProfileKindFromVersion(uint64_t Version) { if (Version & VARIANT_MASK_INSTR_ENTRY) { ProfileKind |= InstrProfKind::FunctionEntryInstrumentation; } + if (Version & VARIANT_MASK_INSTR_LOOP_ENTRIES) { + ProfileKind |= InstrProfKind::LoopEntriesInstrumentation; + } if (Version & VARIANT_MASK_BYTE_COVERAGE) { ProfileKind |= InstrProfKind::SingleByteCoverage; } @@ -262,6 +265,8 @@ Error TextInstrProfReader::readHeader() { ProfileKind |= InstrProfKind::FunctionEntryInstrumentation; else if (Str.equals_insensitive("not_entry_first")) ProfileKind &= ~InstrProfKind::FunctionEntryInstrumentation; + else if (Str.equals_insensitive("instrument_loop_entries")) + ProfileKind |= InstrProfKind::LoopEntriesInstrumentation; else if (Str.equals_insensitive("single_byte_coverage")) ProfileKind |= InstrProfKind::SingleByteCoverage; else if (Str.equals_insensitive("temporal_prof_traces")) { diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index d8ab18d213e3d..64625dee7701e 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -877,6 +877,9 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { if (static_cast(ProfileKind & InstrProfKind::FunctionEntryInstrumentation)) Header.Version |= VARIANT_MASK_INSTR_ENTRY; + if (static_cast(ProfileKind & + InstrProfKind::LoopEntriesInstrumentation)) + Header.Version |= VARIANT_MASK_INSTR_LOOP_ENTRIES; if (static_cast(ProfileKind & InstrProfKind::SingleByteCoverage)) Header.Version |= VARIANT_MASK_BYTE_COVERAGE; if (static_cast(ProfileKind & InstrProfKind::FunctionEntryOnly)) @@ -1120,6 +1123,10 @@ Error InstrProfWriter::writeText(raw_fd_ostream &OS) { if (static_cast(ProfileKind & InstrProfKind::FunctionEntryInstrumentation)) OS << "# Always instrument the function entry block\n:entry_first\n"; + if (static_cast(ProfileKind & + InstrProfKind::LoopEntriesInstrumentation)) + OS << "# Always instrument the loop entry " + "blocks\n:instrument_loop_entries\n"; if (static_cast(ProfileKind & InstrProfKind::SingleByteCoverage)) OS << "# Instrument block coverage\n:single_byte_coverage\n"; InstrProfSymtab Symtab; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 2d5ae1c1d14dc..51d22a893c39a 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -4585,7 +4585,7 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, break; // Reject anything that may alias the collected instructions. 
- if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects()) + if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() || MI.isCall()) break; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index a8ba89f784c8c..56ff7b0d3a280 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -145,8 +145,12 @@ def gi_extract_high_v4i32 : def extract_high_v8f16 : ComplexPattern; +def extract_high_v8bf16 : + ComplexPattern; def extract_high_v4f32 : ComplexPattern; +def extract_high_v2f64 : + ComplexPattern; def gi_extract_high_v8f16 : GIComplexOperandMatcher, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 7614f6215b803..d015cc15581ad 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7352,7 +7352,8 @@ def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx), // All concat_vectors operations are canonicalised to act on i64 vectors for // AArch64. In the general case we need an instruction, which had just as well be // INS. -multiclass ConcatPat { +multiclass ConcatPat { def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)), (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>; @@ -7365,16 +7366,22 @@ multiclass ConcatPat { // If the high lanes are undef we can just ignore them: def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>; -} -defm : ConcatPat; -defm : ConcatPat; -defm : ConcatPat; -defm : ConcatPat; -defm : ConcatPat; -defm : ConcatPat; -defm : ConcatPat; -defm : ConcatPat; + // Concatting the high half of two vectors is the insert of the first + // into the low half of the second. 
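The new ConcatPat case added just below matches exactly this source-level idiom. A small C++ sketch (using ACLE NEON intrinsics, not taken from the patch) of code that the pattern lets ISel lower to a single INS:

    #include <arm_neon.h>
    // concat(high(A), high(B)): A's high half becomes the result's low half,
    // and B already holds the result's high half in its own high lane, so a
    // single lane-insert (INS) starting from B's register suffices.
    float32x4_t concat_highs(float32x4_t A, float32x4_t B) {
      return vcombine_f32(vget_high_f32(A), vget_high_f32(B));
    }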
+ def : Pat<(DstTy (concat_vectors (ExtractHigh (DstTy V128:$Rn)), + (ExtractHigh (DstTy V128:$Rm)))), + (INSvi64lane V128:$Rm, (i64 0), V128:$Rn, (i64 1))>; +} + +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; //---------------------------------------------------------------------------- // AdvSIMD across lanes instructions diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 1ba6f238342cd..b9769a1baf4d1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -29,6 +29,9 @@ void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &); FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone); FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone); void initializeAMDGPURegBankCombinerPass(PassRegistry &); +FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass(); +FunctionPass *createAMDGPURegBankSelectPass(); +FunctionPass *createAMDGPURegBankLegalizePass(); // SI Passes FunctionPass *createGCNDPPCombinePass(); @@ -36,7 +39,6 @@ FunctionPass *createSIAnnotateControlFlowLegacyPass(); FunctionPass *createSIFoldOperandsLegacyPass(); FunctionPass *createSIPeepholeSDWALegacyPass(); FunctionPass *createSILowerI1CopiesLegacyPass(); -FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass(); FunctionPass *createSIShrinkInstructionsLegacyPass(); FunctionPass *createSILoadStoreOptimizerLegacyPass(); FunctionPass *createSIWholeQuadModePass(); @@ -186,6 +188,12 @@ extern char &SILowerI1CopiesLegacyID; void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &); extern char &AMDGPUGlobalISelDivergenceLoweringID; +void initializeAMDGPURegBankSelectPass(PassRegistry &); +extern char &AMDGPURegBankSelectID; + +void initializeAMDGPURegBankLegalizePass(PassRegistry &); +extern char &AMDGPURegBankLegalizeID; + void initializeAMDGPUMarkLastScratchLoadPass(PassRegistry &); extern char &AMDGPUMarkLastScratchLoadID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index e4ca1ae0499b9..c09c71c83fead 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -66,11 +66,12 @@ class AMDGPUAtomicOptimizer : public FunctionPass { class AMDGPUAtomicOptimizerImpl : public InstVisitor { private: + Function &F; SmallVector ToReplace; - const UniformityInfo *UA; - const DataLayout *DL; + const UniformityInfo &UA; + const DataLayout &DL; DomTreeUpdater &DTU; - const GCNSubtarget *ST; + const GCNSubtarget &ST; bool IsPixelShader; ScanOptions ScanImpl; @@ -91,13 +92,14 @@ class AMDGPUAtomicOptimizerImpl public: AMDGPUAtomicOptimizerImpl() = delete; - AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL, - DomTreeUpdater &DTU, const GCNSubtarget *ST, - bool IsPixelShader, ScanOptions ScanImpl) - : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader), + AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA, + DomTreeUpdater &DTU, const GCNSubtarget &ST, + ScanOptions ScanImpl) + : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST), + IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS), ScanImpl(ScanImpl) {} - bool run(Function &F); + bool run(); void visitAtomicRMWInst(AtomicRMWInst &I); void visitIntrinsicInst(IntrinsicInst &I); @@ -114,40 +116,30 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) { return false; } - const UniformityInfo *UA = - 
&getAnalysis().getUniformityInfo(); - const DataLayout *DL = &F.getDataLayout(); + const UniformityInfo &UA = + getAnalysis().getUniformityInfo(); - DominatorTreeWrapperPass *const DTW = + DominatorTreeWrapperPass *DTW = getAnalysisIfAvailable(); DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr, DomTreeUpdater::UpdateStrategy::Lazy); const TargetPassConfig &TPC = getAnalysis(); const TargetMachine &TM = TPC.getTM(); - const GCNSubtarget *ST = &TM.getSubtarget(F); + const GCNSubtarget &ST = TM.getSubtarget(F); - bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; - - return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl) - .run(F); + return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run(); } PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F, FunctionAnalysisManager &AM) { - - const auto *UA = &AM.getResult(F); - const DataLayout *DL = &F.getDataLayout(); + const auto &UA = AM.getResult(F); DomTreeUpdater DTU(&AM.getResult(F), DomTreeUpdater::UpdateStrategy::Lazy); - const GCNSubtarget *ST = &TM.getSubtarget(F); - - bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; + const GCNSubtarget &ST = TM.getSubtarget(F); - bool IsChanged = - AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl) - .run(F); + bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run(); if (!IsChanged) { return PreservedAnalyses::all(); @@ -158,7 +150,7 @@ PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F, return PA; } -bool AMDGPUAtomicOptimizerImpl::run(Function &F) { +bool AMDGPUAtomicOptimizerImpl::run() { // Scan option None disables the Pass if (ScanImpl == ScanOptions::None) { @@ -234,18 +226,18 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) { // If the pointer operand is divergent, then each lane is doing an atomic // operation on a different address, and we cannot optimize that. - if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) { + if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) { return; } - bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx)); + bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget (for DPP strategy), and the atomic // operation is 32 or 64 bits. if (ValDivergent) { - if (ScanImpl == ScanOptions::DPP && !ST->hasDPP()) + if (ScanImpl == ScanOptions::DPP && !ST.hasDPP()) return; if (!isLegalCrossLaneType(I.getType())) @@ -324,14 +316,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) { const unsigned ValIdx = 0; - const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx)); + const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget (for DPP strategy), and the atomic // operation is 32 or 64 bits. if (ValDivergent) { - if (ScanImpl == ScanOptions::DPP && !ST->hasDPP()) + if (ScanImpl == ScanOptions::DPP && !ST.hasDPP()) return; if (!isLegalCrossLaneType(I.getType())) @@ -341,7 +333,7 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) { // If any of the other arguments to the intrinsic are divergent, we can't // optimize the operation. 
for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) { - if (UA->isDivergentUse(I.getOperandUse(Idx))) { + if (UA.isDivergentUse(I.getOperandUse(Idx))) { return; } } @@ -418,17 +410,17 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, } // Reduce within each pair of rows (i.e. 32 lanes). - assert(ST->hasPermLaneX16()); + assert(ST.hasPermLaneX16()); Value *Permlanex16Call = B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16, {PoisonValue::get(AtomicTy), V, B.getInt32(0), B.getInt32(0), B.getFalse(), B.getFalse()}); V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call); - if (ST->isWave32()) { + if (ST.isWave32()) { return V; } - if (ST->hasPermLane64()) { + if (ST.hasPermLane64()) { // Reduce across the upper and lower 32 lanes. Value *Permlane64Call = B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V); @@ -461,7 +453,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()})); } - if (ST->hasDPPBroadcasts()) { + if (ST.hasDPPBroadcasts()) { // GFX9 has DPP row broadcast operations. V = buildNonAtomicBinOp( B, Op, V, @@ -479,7 +471,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes // 48..63). - assert(ST->hasPermLaneX16()); + assert(ST.hasPermLaneX16()); Value *PermX = B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16, {PoisonValue::get(AtomicTy), V, B.getInt32(-1), @@ -490,7 +482,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}); V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall); - if (!ST->isWave32()) { + if (!ST.isWave32()) { // Combine lane 31 into lanes 32..63. Value *const Lane31 = B.CreateIntrinsic( AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)}); @@ -513,7 +505,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = Intrinsic::getOrInsertDeclaration( M, Intrinsic::amdgcn_update_dpp, AtomicTy); - if (ST->hasDPPWavefrontShifts()) { + if (ST.hasDPPWavefrontShifts()) { // GFX9 has DPP wavefront shift operations. V = B.CreateCall(UpdateDPP, {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), @@ -535,7 +527,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}), B.getInt32(16), V}); - if (!ST->isWave32()) { + if (!ST.isWave32()) { // Copy the old lane 31 to the new lane 32. V = B.CreateCall( WriteLane, @@ -560,7 +552,7 @@ std::pair AMDGPUAtomicOptimizerImpl::buildScanIteratively( IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V, Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const { auto *Ty = I.getType(); - auto *WaveTy = B.getIntNTy(ST->getWavefrontSize()); + auto *WaveTy = B.getIntNTy(ST.getWavefrontSize()); auto *EntryBB = I.getParent(); auto NeedResult = !I.use_empty(); @@ -698,7 +690,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, Type *const Ty = I.getType(); Type *Int32Ty = B.getInt32Ty(); bool isAtomicFloatingPointTy = Ty->isFloatingPointTy(); - [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty); + [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty); // This is the value in the atomic operation we need to combine in order to // reduce the number of atomic operations. 
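The optimizeAtomic code touched below gives each active lane a dense index by counting the active lanes before it; the hunks themselves only flip ST-> to ST. after the pointer-to-reference cleanup. A hedged sketch of the wave32 lane-counting idiom, using the same intrinsics as the patch:

    // Ballot yields a bitmask of currently active lanes; mbcnt_lo counts the
    // set bits below the current lane, producing a dense 0..N-1 lane index.
    Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
    CallInst *const Ballot =
        B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
    Value *Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                                     {Ballot, B.getInt32(0)});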
@@ -706,7 +698,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // We need to know how many lanes are active within the wavefront, and we do // this by doing a ballot of active lanes. - Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize()); + Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize()); CallInst *const Ballot = B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue()); @@ -715,7 +707,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // below us only if its associated index was less than ours. We do this by // using the mbcnt intrinsic. Value *Mbcnt; - if (ST->isWave32()) { + if (ST.isWave32()) { Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {Ballot, B.getInt32(0)}); } else { @@ -755,7 +747,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // that they can correctly contribute to the final result. NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - if (!NeedResult && ST->hasPermLaneX16()) { + if (!NeedResult && ST.hasPermLaneX16()) { // On GFX10 the permlanex16 instruction helps us build a reduction // without too many readlanes and writelanes, which are generally bad // for performance. @@ -767,7 +759,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // Read the value from the last lane, which has accumulated the values // of each active lane in the wavefront. This will be our new value // which we will provide to the atomic operation. - Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); + Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1); NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane, {NewV, LastLaneIdx}); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 7257b53afe69d..75e20c7930168 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -95,32 +95,45 @@ static cl::opt DisableFDivExpand( cl::ReallyHidden, cl::init(false)); +static bool hasUnsafeFPMath(const Function &F) { + return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); +} + class AMDGPUCodeGenPrepareImpl : public InstVisitor { public: - const GCNSubtarget *ST = nullptr; - const AMDGPUTargetMachine *TM = nullptr; - const TargetLibraryInfo *TLInfo = nullptr; - AssumptionCache *AC = nullptr; - DominatorTree *DT = nullptr; - UniformityInfo *UA = nullptr; - Module *Mod = nullptr; - const DataLayout *DL = nullptr; - bool HasUnsafeFPMath = false; - bool HasFP32DenormalFlush = false; + Function &F; + const GCNSubtarget &ST; + const AMDGPUTargetMachine &TM; + const TargetLibraryInfo *TLI; + AssumptionCache *AC; + const DominatorTree *DT; + const UniformityInfo &UA; + const DataLayout &DL; + const bool HasUnsafeFPMath; + const bool HasFP32DenormalFlush; bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; mutable Function *LdexpF32 = nullptr; DenseMap BreakPhiNodesCache; + AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM, + const TargetLibraryInfo *TLI, AssumptionCache *AC, + const DominatorTree *DT, const UniformityInfo &UA) + : F(F), ST(TM.getSubtarget(F)), TM(TM), TLI(TLI), AC(AC), + DT(DT), UA(UA), DL(F.getDataLayout()), + HasUnsafeFPMath(hasUnsafeFPMath(F)), + HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals == + DenormalMode::getPreserveSign()) {} + Function *getSqrtF32() const { if (SqrtF32) return SqrtF32; - LLVMContext &Ctx = Mod->getContext(); - SqrtF32 = 
Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_sqrt, - {Type::getFloatTy(Ctx)}); + LLVMContext &Ctx = F.getContext(); + SqrtF32 = Intrinsic::getOrInsertDeclaration( + F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)}); return SqrtF32; } @@ -128,9 +141,10 @@ class AMDGPUCodeGenPrepareImpl if (LdexpF32) return LdexpF32; - LLVMContext &Ctx = Mod->getContext(); + LLVMContext &Ctx = F.getContext(); LdexpF32 = Intrinsic::getOrInsertDeclaration( - Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)}); + F.getParent(), Intrinsic::ldexp, + {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)}); return LdexpF32; } @@ -166,8 +180,7 @@ class AMDGPUCodeGenPrepareImpl /// Wrapper to pass all the arguments to computeKnownFPClass KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested, const Instruction *CtxI) const { - return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI, - DT); + return llvm::computeKnownFPClass(V, DL, Interested, 0, TLI, AC, CtxI, DT); } bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const { @@ -317,13 +330,10 @@ class AMDGPUCodeGenPrepareImpl bool visitBitreverseIntrinsicInst(IntrinsicInst &I); bool visitMinNum(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); - bool run(Function &F); + bool run(); }; class AMDGPUCodeGenPrepare : public FunctionPass { -private: - AMDGPUCodeGenPrepareImpl Impl; - public: static char ID; AMDGPUCodeGenPrepare() : FunctionPass(ID) { @@ -339,13 +349,12 @@ class AMDGPUCodeGenPrepare : public FunctionPass { AU.setPreservesAll(); } bool runOnFunction(Function &F) override; - bool doInitialization(Module &M) override; StringRef getPassName() const override { return "AMDGPU IR optimizations"; } }; } // end anonymous namespace -bool AMDGPUCodeGenPrepareImpl::run(Function &F) { +bool AMDGPUCodeGenPrepareImpl::run() { BreakPhiNodesCache.clear(); bool MadeChange = false; @@ -411,7 +420,7 @@ bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const { if (const VectorType *VT = dyn_cast(T)) { // TODO: The set of packed operations is more limited, so may want to // promote some anyway. - if (ST->hasVOP3PInsts()) + if (ST.hasVOP3PInsts()) return false; return needsPromotionToI32(VT->getElementType()); @@ -422,7 +431,7 @@ bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const { bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const { return Ty->isFloatTy() || Ty->isDoubleTy() || - (Ty->isHalfTy() && ST->has16BitInsts()); + (Ty->isHalfTy() && ST.has16BitInsts()); } // Return true if the op promoted to i32 should have nsw set. 
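The refactor above moves all per-function state (subtarget, data layout, unsafe-FP and denormal-mode flags) into the AMDGPUCodeGenPrepareImpl constructor, which now takes the Function directly. A hedged sketch of how a caller drives it after the change, assuming the analyses have already been obtained:

    // ST, DL, HasUnsafeFPMath and HasFP32DenormalFlush are all derived from
    // F inside the constructor; run() no longer takes the function.
    AMDGPUCodeGenPrepareImpl Impl(F, TM, TLI, AC, DT, UA);
    bool MadeChange = Impl.run(); // was: Impl.run(F)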
@@ -455,11 +464,10 @@ static bool promotedOpIsNUW(const Instruction &I) { bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { Type *Ty = I.getType(); - const DataLayout &DL = Mod->getDataLayout(); int TySize = DL.getTypeSizeInBits(Ty); Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty); - return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I); + return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I); } bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const { @@ -591,11 +599,11 @@ bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32( } unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const { - return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits(); + return computeKnownBits(Op, DL, 0, AC).countMaxActiveBits(); } unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const { - return ComputeMaxSignificantBits(Op, *DL, 0, AC); + return ComputeMaxSignificantBits(Op, DL, 0, AC); } static void extractValues(IRBuilder<> &Builder, @@ -631,11 +639,11 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { Type *Ty = I.getType(); unsigned Size = Ty->getScalarSizeInBits(); - if (Size <= 16 && ST->has16BitInsts()) + if (Size <= 16 && ST.has16BitInsts()) return false; // Prefer scalar if this could be s_mul_i32 - if (UA->isUniform(&I)) + if (UA.isUniform(&I)) return false; Value *LHS = I.getOperand(0); @@ -646,11 +654,11 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { unsigned LHSBits = 0, RHSBits = 0; bool IsSigned = false; - if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 && + if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 && (RHSBits = numBitsUnsigned(RHS)) <= 24) { IsSigned = false; - } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 && + } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 && (RHSBits = numBitsSigned(RHS)) <= 24) { IsSigned = true; @@ -730,21 +738,21 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const { if (CastOp) { if (!CastOp->hasOneUse()) return false; - CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL); - CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL); + CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL); + CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL); } // TODO: Handle special 0/-1 cases DAG combine does, although we only really // need to handle divisions here. - Constant *FoldedT = SelOpNo ? - ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) : - ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL); + Constant *FoldedT = + SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL) + : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL); if (!FoldedT || isa(FoldedT)) return false; - Constant *FoldedF = SelOpNo ? - ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) : - ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL); + Constant *FoldedF = + SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL) + : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL); if (!FoldedF || isa(FoldedF)) return false; @@ -777,7 +785,7 @@ AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder, // result? It's unspecified by the spec. Value *FrexpExp = - ST->hasFractBug() + ST.hasFractBug() ? 
Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp, {Builder.getInt32Ty(), Ty}, Src) : Builder.CreateExtractValue(Frexp, {1}); @@ -815,7 +823,7 @@ Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, // If we have have to work around the fract/frexp bug, we're worse off than // using the fdiv.fast expansion. The full safe expansion is faster if we have // fast FMA. - if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() && + if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() && (!FMF.noNaNs() || !FMF.noInfs())) return nullptr; @@ -1157,17 +1165,12 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { if (NewVal) { FDiv.replaceAllUsesWith(NewVal); NewVal->takeName(&FDiv); - RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo); + RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI); } return true; } -static bool hasUnsafeFPMath(const Function &F) { - Attribute Attr = F.getFnAttribute("unsafe-fp-math"); - return Attr.getValueAsBool(); -} - static std::pair getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS) { Type *I32Ty = Builder.getInt32Ty(); @@ -1192,7 +1195,6 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, Value *Den, unsigned AtLeast, bool IsSigned) const { - const DataLayout &DL = Mod->getDataLayout(); unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); if (LHSSignBits < AtLeast) return -1; @@ -1271,7 +1273,7 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl( Value *FQNeg = Builder.CreateFNeg(FQ); // float fr = mad(fqneg, fb, fa); - auto FMAD = !ST->hasMadMacF32Insts() + auto FMAD = !ST.hasMadMacF32Insts() ? Intrinsic::fma : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz; Value *FR = Builder.CreateIntrinsic(FMAD, @@ -1338,7 +1340,7 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I, // If there's no wider mulhi, there's only a better expansion for powers of // two. // TODO: Should really know for each vector element. - if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT)) + if (isKnownToBeAPowerOfTwo(C, DL, true, 0, AC, &I, DT)) return true; return false; @@ -1348,8 +1350,8 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I, // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 if (BinOpDen->getOpcode() == Instruction::Shl && isa(BinOpDen->getOperand(0)) && - isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true, - 0, AC, &I, DT)) { + isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, 0, AC, &I, + DT)) { return true; } } @@ -1357,9 +1359,9 @@ bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I, return false; } -static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) { +static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) { // Check whether the sign can be determined statically. 
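getSign32, whose body follows, folds the sign to a constant when known bits decide it and otherwise emits an arithmetic shift. A self-contained sketch of the underlying identity, for illustration only:

    #include <cstdint>
    // Arithmetic shift right by 31 broadcasts the sign bit across the word:
    // 0 for non-negative values, -1 (all ones) for negative ones.
    int32_t signMask32(int32_t V) { return V >> 31; }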
- KnownBits Known = computeKnownBits(V, *DL); + KnownBits Known = computeKnownBits(V, DL); if (Known.isNegative()) return Constant::getAllOnesValue(V->getType()); if (Known.isNonNegative()) @@ -1542,8 +1544,8 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (foldBinOpIntoSelect(I)) return true; - if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && - UA->isUniform(&I) && promoteUniformOpToI32(I)) + if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) && + UA.isUniform(&I) && promoteUniformOpToI32(I)) return true; if (UseMul24Intrin && replaceMulWithMul24(I)) @@ -1655,11 +1657,11 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { }; WidenLoad->setMetadata(LLVMContext::MD_range, - MDNode::get(Mod->getContext(), LowAndHigh)); + MDNode::get(F.getContext(), LowAndHigh)); } } - int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType()); + int TySize = DL.getTypeSizeInBits(I.getType()); Type *IntNTy = Builder.getIntNTy(TySize); Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); @@ -1674,8 +1676,8 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) { bool Changed = false; - if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) && - UA->isUniform(&I)) + if (ST.has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) && + UA.isUniform(&I)) Changed |= promoteUniformOpToI32(I); return Changed; @@ -1688,8 +1690,8 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Value *CmpVal; FCmpInst::Predicate Pred; - if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) { - if (UA->isUniform(&I)) + if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) { + if (UA.isUniform(&I)) return promoteUniformOpToI32(I); return false; } @@ -1722,7 +1724,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Fract->takeName(&I); I.replaceAllUsesWith(Fract); - RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo); + RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); return true; } @@ -1947,7 +1949,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { FixedVectorType *FVT = dyn_cast(I.getType()); if (!FVT || FVT->getNumElements() == 1 || - DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold) + DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold) return false; if (!ForceBreakLargePHIs && !canBreakPHINode(I)) @@ -1960,7 +1962,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { unsigned Idx = 0; // For 8/16 bits type, don't scalarize fully but break it up into as many // 32-bit slices as we can, and scalarize the tail. 
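// As a standalone illustration of the slicing arithmetic described in the
// comment above (a hypothetical helper, not code from this pass): each 32-bit
// slice holds 32 / EltSize elements, and whatever does not fill a whole slice
// is scalarized.

#include <cstdio>

// Prints the plan for breaking an <NumElts x iEltSize> PHI into 32-bit
// subvectors plus a scalar tail. EltSize is 8 or 16 (bits).
void printSlicingPlan(unsigned EltSize, unsigned NumElts) {
  const unsigned SubVecSize = 32 / EltSize;   // elements per 32-bit slice
  const unsigned NumSlices = NumElts / SubVecSize;
  const unsigned Tail = NumElts % SubVecSize; // scalarized remainder
  std::printf("<%u x i%u>: %u slice(s) of %u, %u scalar tail element(s)\n",
              NumElts, EltSize, NumSlices, SubVecSize, Tail);
}

int main() {
  printSlicingPlan(8, 9);   // <9 x i8>: 2 slices of 4, 1 tail element
  printSlicingPlan(16, 10); // <10 x i16>: 5 slices of 2, 0 tail elements
}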
- const unsigned EltSize = DL->getTypeSizeInBits(EltTy); + const unsigned EltSize = DL.getTypeSizeInBits(EltTy); const unsigned NumElts = FVT->getNumElements(); if (EltSize == 8 || EltSize == 16) { const unsigned SubVecSize = (32 / EltSize); @@ -2079,7 +2081,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { SmallVector WorkList; getUnderlyingObjects(I.getOperand(0), WorkList); if (!all_of(WorkList, [&](const Value *V) { - return isPtrKnownNeverNull(V, *DL, *TM, SrcAS); + return isPtrKnownNeverNull(V, DL, TM, SrcAS); })) return false; @@ -2107,8 +2109,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) { bool Changed = false; - if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && - UA->isUniform(&I)) + if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) && + UA.isUniform(&I)) Changed |= promoteUniformBitreverseToI32(I); return Changed; @@ -2120,7 +2122,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) { /// If fract is a useful instruction for the subtarget. Does not account for the /// nan handling; the instruction has a nan check on the input value. Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) { - if (ST->hasFractBug()) + if (ST.hasFractBug()) return nullptr; if (I.getIntrinsicID() != Intrinsic::minnum) @@ -2177,7 +2179,7 @@ bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) { // Match pattern for fract intrinsic in contexts where the nan check has been // optimized out (and hope the knowledge the source can't be nan wasn't lost). if (!I.hasNoNaNs() && - !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(*DL, TLInfo))) + !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(DL, TLI))) return false; IRBuilder<> Builder(&I); @@ -2189,7 +2191,7 @@ bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) { Fract->takeName(&I); I.replaceAllUsesWith(Fract); - RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo); + RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); return true; } @@ -2201,7 +2203,7 @@ static bool isOneOrNegOne(const Value *Val) { // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way. bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { Type *Ty = Sqrt.getType()->getScalarType(); - if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts())) + if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts())) return false; const FPMathOperator *FPOp = cast(&Sqrt); @@ -2257,14 +2259,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { return true; } -bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { - Impl.Mod = &M; - Impl.DL = &Impl.Mod->getDataLayout(); - Impl.SqrtF32 = nullptr; - Impl.LdexpF32 = nullptr; - return false; -} - bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { if (skipFunction(F)) return false; @@ -2274,36 +2268,26 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { return false; const AMDGPUTargetMachine &TM = TPC->getTM(); - Impl.TM = &TM; - Impl.TLInfo = &getAnalysis().getTLI(F); - Impl.ST = &TM.getSubtarget(F); - Impl.AC = &getAnalysis().getAssumptionCache(F); - Impl.UA = &getAnalysis().getUniformityInfo(); + const TargetLibraryInfo *TLI = + &getAnalysis().getTLI(F); + AssumptionCache *AC = + &getAnalysis().getAssumptionCache(F); auto *DTWP = getAnalysisIfAvailable(); - Impl.DT = DTWP ? 
&DTWP->getDomTree() : nullptr;
-  Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
-  SIModeRegisterDefaults Mode(F, *Impl.ST);
-  Impl.HasFP32DenormalFlush =
-      Mode.FP32Denormals == DenormalMode::getPreserveSign();
-  return Impl.run(F);
+  const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  const UniformityInfo &UA =
+      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
}

PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
                                                FunctionAnalysisManager &FAM) {
-  AMDGPUCodeGenPrepareImpl Impl;
-  Impl.Mod = F.getParent();
-  Impl.DL = &Impl.Mod->getDataLayout();
-  Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
-  Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
-  Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
-  Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
-  Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
-  Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
-  Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
-  SIModeRegisterDefaults Mode(F, *Impl.ST);
-  Impl.HasFP32DenormalFlush =
-      Mode.FP32Denormals == DenormalMode::getPreserveSign();
-  if (!Impl.run(F))
+  const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
+  const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
+  AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
+  const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+  const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
+  AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
+  if (!Impl.run())
    return PreservedAnalyses::all();
  PreservedAnalyses PA = PreservedAnalyses::none();
  if (!Impl.FlowChanged)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
new file mode 100644
index 0000000000000..283173deaeedc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -0,0 +1,79 @@
+//===-- AMDGPURegBankLegalize.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// Lower G_ instructions that can't be inst-selected with the register bank
+/// assignment from AMDGPURegBankSelect based on machine uniformity info.
+/// Given the types of all operands, some register bank assignments require
+/// lowering while others do not.
+/// Note: cases where all register bank assignments would require lowering are
+/// lowered in the legalizer.
+/// For example, vgpr S64 G_AND requires lowering to S32 while sgpr S64 does
+/// not. Eliminate sgpr S1 by lowering it to sgpr S32.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "amdgpu-regbanklegalize"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPURegBankLegalize : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  AMDGPURegBankLegalize() : MachineFunctionPass(ID) {
+    initializeAMDGPURegBankLegalizePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Register Bank Legalize";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  // If there were no PHIs and we performed waterfall expansion, the machine
+  // verifier would fail.
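+  // Waterfall expansion rewrites a divergent operation into a loop over the
+  // remaining lanes, and that loop introduces PHI nodes, so the NoPHIs
+  // property can no longer be assumed and is cleared below.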
+ MachineFunctionProperties getClearedProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoPHIs); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(AMDGPURegBankLegalize, DEBUG_TYPE, + "AMDGPU Register Bank Legalize", false, false) +INITIALIZE_PASS_END(AMDGPURegBankLegalize, DEBUG_TYPE, + "AMDGPU Register Bank Legalize", false, false) + +char AMDGPURegBankLegalize::ID = 0; + +char &llvm::AMDGPURegBankLegalizeID = AMDGPURegBankLegalize::ID; + +FunctionPass *llvm::createAMDGPURegBankLegalizePass() { + return new AMDGPURegBankLegalize(); +} + +using namespace AMDGPU; + +bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + return true; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp new file mode 100644 index 0000000000000..4c499cb4dfe20 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp @@ -0,0 +1,74 @@ +//===-- AMDGPURegBankSelect.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// Assign register banks to all register operands of G_ instructions using +/// machine uniformity analysis. +/// Sgpr - uniform values and some lane masks +/// Vgpr - divergent, non S1, values +/// Vcc - divergent S1 values(lane masks) +/// However in some cases G_ instructions with this register bank assignment +/// can't be inst-selected. This is solved in AMDGPURegBankLegalize. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" + +#define DEBUG_TYPE "amdgpu-regbankselect" + +using namespace llvm; + +namespace { + +class AMDGPURegBankSelect : public MachineFunctionPass { +public: + static char ID; + + AMDGPURegBankSelect() : MachineFunctionPass(ID) { + initializeAMDGPURegBankSelectPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "AMDGPU Register Bank Select"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } + + // This pass assigns register banks to all virtual registers, and we maintain + // this property in subsequent passes + MachineFunctionProperties getSetProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::RegBankSelected); + } +}; + +} // End anonymous namespace. 
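// For readers new to this machinery, here is a deliberately simplified model
// of the property bookkeeping the two passes above rely on (toy code with
// assumed names, not the MachineFunctionProperties API): a pass advertises
// properties it establishes (set) and properties it invalidates (cleared),
// and the verifier checks the running set between passes.

#include <bitset>
#include <cassert>

enum ToyProperty { NoPHIs, RegBankSelected, NumToyProperties };
using ToyProperties = std::bitset<NumToyProperties>;

struct ToyPass {
  ToyProperties Set;     // guaranteed to hold after the pass
  ToyProperties Cleared; // may no longer hold after the pass
};

ToyProperties applyPass(ToyProperties Current, const ToyPass &P) {
  return (Current & ~P.Cleared) | P.Set;
}

int main() {
  ToyProperties MF;
  MF.set(NoPHIs);
  ToyPass RegBankSelect; // sets RegBankSelected, like AMDGPURegBankSelect
  RegBankSelect.Set.set(RegBankSelected);
  ToyPass RegBankLegalize; // clears NoPHIs (waterfall loops create PHIs)
  RegBankLegalize.Cleared.set(NoPHIs);
  MF = applyPass(MF, RegBankSelect);
  MF = applyPass(MF, RegBankLegalize);
  assert(MF.test(RegBankSelected) && !MF.test(NoPHIs));
}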
+ +INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, DEBUG_TYPE, + "AMDGPU Register Bank Select", false, false) +INITIALIZE_PASS_END(AMDGPURegBankSelect, DEBUG_TYPE, + "AMDGPU Register Bank Select", false, false) + +char AMDGPURegBankSelect::ID = 0; + +char &llvm::AMDGPURegBankSelectID = AMDGPURegBankSelect::ID; + +FunctionPass *llvm::createAMDGPURegBankSelectPass() { + return new AMDGPURegBankSelect(); +} + +bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + return true; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index a5eb7cbf1bb10..6e2eb254ff60c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -448,6 +448,12 @@ static cl::opt cl::desc("Enable AMDGPUAttributorPass"), cl::init(true), cl::Hidden); +static cl::opt NewRegBankSelect( + "new-reg-bank-select", + cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of " + "regbankselect"), + cl::init(false), cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheR600Target()); @@ -464,6 +470,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeGCNDPPCombineLegacyPass(*PR); initializeSILowerI1CopiesLegacyPass(*PR); initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR); + initializeAMDGPURegBankSelectPass(*PR); + initializeAMDGPURegBankLegalizePass(*PR); initializeSILowerWWMCopiesPass(*PR); initializeAMDGPUMarkLastScratchLoadPass(*PR); initializeSILowerSGPRSpillsLegacyPass(*PR); @@ -1385,7 +1393,12 @@ void GCNPassConfig::addPreRegBankSelect() { } bool GCNPassConfig::addRegBankSelect() { - addPass(new RegBankSelect()); + if (NewRegBankSelect) { + addPass(createAMDGPURegBankSelectPass()); + addPass(createAMDGPURegBankLegalizePass()); + } else { + addPass(new RegBankSelect()); + } return false; } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index b0197c3c6c280..68d141e338a88 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -92,6 +92,8 @@ add_llvm_target(AMDGPUCodeGen AMDGPUPromoteAlloca.cpp AMDGPUPromoteKernelArguments.cpp AMDGPURegBankCombiner.cpp + AMDGPURegBankLegalize.cpp + AMDGPURegBankSelect.cpp AMDGPURegisterBankInfo.cpp AMDGPURemoveIncompatibleFunctions.cpp AMDGPUReserveWWMRegs.cpp diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index fa39df9ae5ba8..4ff6fc32b642d 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -224,8 +224,13 @@ Value *SIAnnotateControlFlow::handleLoopCondition( if (Instruction *Inst = dyn_cast(Cond)) { BasicBlock *Parent = Inst->getParent(); Instruction *Insert; - if (L->contains(Inst)) { + if (LI->getLoopFor(Parent) == L) { + // Insert IfBreak in the same BB as Cond, which can help + // SILowerControlFlow to know that it does not have to insert an + // AND with EXEC. 
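+        // Three placements are possible: if Cond is defined in a block that
+        // belongs directly to this loop, insert at that block's terminator;
+        // if it is defined in a loop nested inside L, insert at Term;
+        // otherwise Cond is defined outside the loop and the if-break is
+        // inserted at the loop header.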
Insert = Parent->getTerminator(); + } else if (L->contains(Inst)) { + Insert = Term; } else { Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 065abde62af8a..ff9376e635af9 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1457,6 +1457,23 @@ let SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 in { defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", VOP3_Profile, int_amdgcn_ashr_pk_u8_i32>; } // End SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 +class AshrPkI8Pat: GCNPat< + (i16 (or (i16 (shl (i16 (trunc (i32 (AMDGPUsmed3 (i32 (sra i32:$src1, i32:$src2)), (i32 lo), (i32 hi))))), (i16 8))), + (i16 (and (i16 (trunc (i32 (AMDGPUsmed3 (i32 (sra i32:$src0, i32:$src2)), (i32 lo), (i32 hi))))), (i16 255))))), + (inst 0, VSrc_b32:$src0, 0, VSrc_b32:$src1, 0, VSrc_b32:$src2, 0 ) +>; + +class AshrPkU8Pat: GCNPat< + (i16 (or (i16 (shl (i16 (trunc (i32 (AMDGPUsmed3 (i32 (sra i32:$src1, i32:$src2)), (i32 lo), (i32 hi))))), (i16 8))), + (i16 (trunc (i32 (AMDGPUsmed3 (i32 (sra i32:$src0, i32:$src2)), (i32 lo), (i32 hi))))))), + (inst 0, VSrc_b32:$src0, 0, VSrc_b32:$src1, 0, VSrc_b32:$src2, 0 ) +>; + +let SubtargetPredicate = HasAshrPkInsts in { + def : AshrPkI8Pat; + def : AshrPkU8Pat; +} + //===----------------------------------------------------------------------===// // Integer Clamp Patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp index f0e6837d49a97..d682b7dbe3ce2 100644 --- a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp @@ -37,7 +37,7 @@ class LoongArchDeadRegisterDefinitions : public MachineFunctionPass { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 16bceacfaa222..5a21ac7ebba0d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6404,8 +6404,8 @@ ISD::NodeType LoongArchTargetLowering::getExtendForAtomicCmpSwapArg() const { } bool LoongArchTargetLowering::shouldSignExtendTypeInLibCall( - EVT Type, bool IsSigned) const { - if (Subtarget.is64Bit() && Type == MVT::i32) + Type *Ty, bool IsSigned) const { + if (Subtarget.is64Bit() && Ty->isIntegerTy(32)) return true; return IsSigned; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 605093b01476d..e6de0dc4e361a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -273,7 +273,7 @@ class LoongArchTargetLowering : public TargetLowering { return false; } bool shouldConsiderGEPOffsetSplit() const override { return true; } - bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override; + bool shouldSignExtendTypeInLibCall(Type *Ty, bool IsSigned) const override; bool shouldExtendTypeInLibCall(EVT Type) const override; bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp 
b/llvm/lib/Target/Mips/MipsISelLowering.cpp index d90348153fd3e..036b59c57d5b0 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -3871,10 +3871,10 @@ MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv, return CCInfo.CheckReturn(Outs, RetCC_Mips); } -bool MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, +bool MipsTargetLowering::shouldSignExtendTypeInLibCall(Type *Ty, bool IsSigned) const { - if ((ABI.IsN32() || ABI.IsN64()) && Type == MVT::i32) - return true; + if ((ABI.IsN32() || ABI.IsN64()) && Ty->isIntegerTy(32)) + return true; return IsSigned; } diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index 8033898091c75..e245c056de649 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -623,7 +623,7 @@ class TargetRegisterClass; SDValue LowerInterruptReturn(SmallVectorImpl &RetOps, const SDLoc &DL, SelectionDAG &DAG) const; - bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override; + bool shouldSignExtendTypeInLibCall(Type *Ty, bool IsSigned) const override; // Inline asm support ConstraintType getConstraintType(StringRef Constraint) const override; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index e917ef3f5e8c9..564fa29bce7d2 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -18828,7 +18828,7 @@ SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); SDValue Callee = DAG.getExternalSymbol(LibCallName, TLI.getPointerTy(DAG.getDataLayout())); - bool SignExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, false); + bool SignExtend = TLI.shouldSignExtendTypeInLibCall(RetTy, false); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (const SDValue &N : Op->op_values()) { @@ -18836,7 +18836,7 @@ SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op, Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = N; Entry.Ty = ArgTy; - Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, SignExtend); + Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgTy, SignExtend); Entry.IsZExt = !Entry.IsSExt; Args.push_back(Entry); } diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 2b64ab9aa6973..11d7ea68312fb 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -99,11 +99,6 @@ static cl::opt cl::desc("Expand eligible cr-logical binary ops to branches"), cl::init(true), cl::Hidden); -static cl::opt MergeStringPool( - "ppc-merge-string-pool", - cl::desc("Merge all of the strings in a module into one pool"), - cl::init(true), cl::Hidden); - static cl::opt EnablePPCGenScalarMASSEntries( "enable-ppc-gen-scalar-mass", cl::init(false), cl::desc("Enable lowering math functions to their corresponding MASS " diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index a8052839b5c6a..911827da06197 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -520,11 +520,13 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) getActionDefinitionsBuilder(G_FPTRUNC) .legalFor(ST.hasStdExtD(), {{s32, s64}}) .legalFor(ST.hasStdExtZfh(), 
{{s16, s32}}) - .legalFor(ST.hasStdExtZfh() && ST.hasStdExtD(), {{s16, s64}}); + .legalFor(ST.hasStdExtZfh() && ST.hasStdExtD(), {{s16, s64}}) + .libcallFor({{s32, s64}}); getActionDefinitionsBuilder(G_FPEXT) .legalFor(ST.hasStdExtD(), {{s64, s32}}) .legalFor(ST.hasStdExtZfh(), {{s32, s16}}) - .legalFor(ST.hasStdExtZfh() && ST.hasStdExtD(), {{s64, s16}}); + .legalFor(ST.hasStdExtZfh() && ST.hasStdExtD(), {{s64, s16}}) + .libcallFor({{s64, s32}}); getActionDefinitionsBuilder(G_FCMP) .legalFor(ST.hasStdExtF(), {{sXLen, s32}}) diff --git a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp index 4b35f3bb0a524..7bcf3397df97e 100644 --- a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp @@ -37,7 +37,7 @@ class RISCVDeadRegisterDefinitions : public MachineFunctionPass { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index d423fd421873f..cfb7ff7696191 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4440,40 +4440,15 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL, MVT VT, // a zero extend and the simm5 check in isel would fail. // FIXME: Should we ignore the upper bits in isel instead? unsigned ExtOpc = - isa(Scalar) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND; + isa(Scalar) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND; Scalar = DAG.getNode(ExtOpc, DL, XLenVT, Scalar); - return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, - DAG.getUNDEF(VT), Scalar, VL); -} - -// Is this a shuffle extracts either the even or odd elements of a vector? -// That is, specifically, either (a) or (b) in the options below. -// Single operand shuffle is easy: -// a) t35: v8i8 = vector_shuffle<0,2,4,6,u,u,u,u> t34, undef -// b) t35: v8i8 = vector_shuffle<1,3,5,7,u,u,u,u> t34, undef -// Double operand shuffle: -// t34: v8i8 = extract_subvector t11, Constant:i64<0> -// t33: v8i8 = extract_subvector t11, Constant:i64<8> -// a) t35: v8i8 = vector_shuffle<0,2,4,6,8,10,12,14> t34, t33 -// b) t35: v8i8 = vector_shuffle<1,3,5,7,9,11,13,15> t34, t33 -static SDValue isDeinterleaveShuffle(MVT VT, MVT ContainerVT, SDValue V1, - SDValue V2, ArrayRef Mask, - const RISCVSubtarget &Subtarget) { - // Need to be able to widen the vector. - if (VT.getScalarSizeInBits() >= Subtarget.getELen()) - return SDValue(); - - // First index must be the first even or odd element from V1. - if (Mask[0] != 0 && Mask[0] != 1) - return SDValue(); - - // The others must increase by 2 each time. - for (unsigned i = 1; i != Mask.size(); ++i) - if (Mask[i] != -1 && Mask[i] != Mask[0] + (int)i * 2) - return SDValue(); + return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, DAG.getUNDEF(VT), Scalar, + VL); +} - if (1 == count_if(Mask, [](int Idx) { return Idx != -1; })) - return SDValue(); +// Can this shuffle be performed on exactly one (possibly larger) input? +static SDValue getSingleShuffleSrc(MVT VT, MVT ContainerVT, SDValue V1, + SDValue V2) { if (V2.isUndef() && RISCVTargetLowering::getLMUL(ContainerVT) != RISCVII::VLMUL::LMUL_8) @@ -4490,12 +4465,13 @@ static SDValue isDeinterleaveShuffle(MVT VT, MVT ContainerVT, SDValue V1, return SDValue(); // Src needs to have twice the number of elements. 
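  // (Both shuffle inputs must come from one wider register for the
  // shift-and-truncate lowering used below: reading adjacent lane groups only
  // works when V1 and V2 are the low and high halves of the same source.)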
- if (Src.getValueType().getVectorNumElements() != (Mask.size() * 2)) + unsigned NumElts = VT.getVectorNumElements(); + if (Src.getValueType().getVectorNumElements() != (NumElts * 2)) return SDValue(); // The extracts must extract the two halves of the source. if (V1.getConstantOperandVal(1) != 0 || - V2.getConstantOperandVal(1) != Mask.size()) + V2.getConstantOperandVal(1) != NumElts) return SDValue(); return Src; @@ -4612,36 +4588,29 @@ static int isElementRotate(int &LoSrc, int &HiSrc, ArrayRef Mask) { return Rotation; } -// Lower a deinterleave shuffle to vnsrl. -// [a, p, b, q, c, r, d, s] -> [a, b, c, d] (EvenElts == true) -// -> [p, q, r, s] (EvenElts == false) -// VT is the type of the vector to return, <[vscale x ]n x ty> -// Src is the vector to deinterleave of type <[vscale x ]n*2 x ty> -static SDValue getDeinterleaveViaVNSRL(const SDLoc &DL, MVT VT, SDValue Src, - bool EvenElts, SelectionDAG &DAG) { - // The result is a vector of type . The source is a vector of - // type (For the single source case, the high half is undef) - if (Src.getValueType() == VT) { - EVT WideVT = VT.getDoubleNumVectorElementsVT(); - Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, DAG.getUNDEF(WideVT), - Src, DAG.getVectorIdxConstant(0, DL)); - } - - // Bitcast the source vector from -> - // This also converts FP to int. +// Lower a deinterleave shuffle to SRL and TRUNC. Factor must be +// 2, 4, 8 and the integer type Factor-times larger than VT's +// element type must be a legal element type. +// [a, p, b, q, c, r, d, s] -> [a, b, c, d] (Factor=2, Index=0) +// -> [p, q, r, s] (Factor=2, Index=1) +static SDValue getDeinterleaveShiftAndTrunc(const SDLoc &DL, MVT VT, + SDValue Src, unsigned Factor, + unsigned Index, SelectionDAG &DAG) { unsigned EltBits = VT.getScalarSizeInBits(); - MVT WideSrcVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * 2), - VT.getVectorElementCount()); + ElementCount SrcEC = Src.getValueType().getVectorElementCount(); + MVT WideSrcVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Factor), + SrcEC.divideCoefficientBy(Factor)); + MVT ResVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), + SrcEC.divideCoefficientBy(Factor)); Src = DAG.getBitcast(WideSrcVT, Src); - MVT IntVT = VT.changeVectorElementTypeToInteger(); - - // If we want even elements, then the shift amount is 0. Otherwise, shift by - // the original element size. - unsigned Shift = EvenElts ? 0 : EltBits; + unsigned Shift = Index * EltBits; SDValue Res = DAG.getNode(ISD::SRL, DL, WideSrcVT, Src, DAG.getConstant(Shift, DL, WideSrcVT)); - Res = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Res); + Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT, Res); + MVT IntVT = VT.changeVectorElementTypeToInteger(); + Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, IntVT, DAG.getUNDEF(IntVT), Res, + DAG.getVectorIdxConstant(0, DL)); return DAG.getBitcast(VT, Res); } @@ -5332,11 +5301,24 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, if (ShuffleVectorInst::isReverseMask(Mask, NumElts) && V2.isUndef()) return DAG.getNode(ISD::VECTOR_REVERSE, DL, VT, V1); - // If this is a deinterleave and we can widen the vector, then we can use - // vnsrl to deinterleave. - if (SDValue Src = - isDeinterleaveShuffle(VT, ContainerVT, V1, V2, Mask, Subtarget)) - return getDeinterleaveViaVNSRL(DL, VT, Src, Mask[0] == 0, DAG); + // If this is a deinterleave(2,4,8) and we can widen the vector, then we can + // use shift and truncate to perform the shuffle. 
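// The shift-and-truncate trick is easiest to see on plain integers. This
// standalone sketch (illustrative only, not the SelectionDAG code)
// deinterleaves Factor=2 byte lanes by reading each pair as one wider i16
// lane, shifting right by Index * 8, and truncating back to i8 -- the scalar
// analogue of the SRL + TRUNCATE nodes built above.

#include <cstdint>
#include <cstdio>

void deinterleave2(const uint8_t In[8], uint8_t Out[4], unsigned Index) {
  for (unsigned i = 0; i != 4; ++i) {
    // One little-endian i16 "wide lane" holding the pair {In[2i], In[2i+1]}.
    uint16_t Wide = static_cast<uint16_t>(In[2 * i]) |
                    static_cast<uint16_t>(In[2 * i + 1]) << 8;
    Out[i] = static_cast<uint8_t>(Wide >> (Index * 8));
  }
}

int main() {
  const uint8_t In[8] = {'a', 'p', 'b', 'q', 'c', 'r', 'd', 's'};
  uint8_t Even[4], Odd[4];
  deinterleave2(In, Even, 0); // a b c d
  deinterleave2(In, Odd, 1);  // p q r s
  std::printf("%.4s %.4s\n", reinterpret_cast<const char *>(Even),
              reinterpret_cast<const char *>(Odd));
}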
+ // TODO: For Factor=6, we can perform the first step of the deinterleave via + // shift-and-trunc reducing total cost for everything except an mf8 result. + // TODO: For Factor=4,8, we can do the same when the ratio isn't high enough + // to do the entire operation. + if (VT.getScalarSizeInBits() < Subtarget.getELen()) { + const unsigned MaxFactor = Subtarget.getELen() / VT.getScalarSizeInBits(); + assert(MaxFactor == 2 || MaxFactor == 4 || MaxFactor == 8); + for (unsigned Factor = 2; Factor <= MaxFactor; Factor <<= 1) { + unsigned Index = 0; + if (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor, Index) && + 1 < count_if(Mask, [](int Idx) { return Idx != -1; })) { + if (SDValue Src = getSingleShuffleSrc(VT, ContainerVT, V1, V2)) + return getDeinterleaveShiftAndTrunc(DL, VT, Src, Factor, Index, DAG); + } + } + } if (SDValue V = lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG)) @@ -10736,32 +10718,37 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op, SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Op.getOperand(0), Op.getOperand(1)); - // We want to operate on all lanes, so get the mask and VL and mask for it - auto [Mask, VL] = getDefaultScalableVLOps(ConcatVT, DL, DAG, Subtarget); - SDValue Passthru = DAG.getUNDEF(ConcatVT); - // We can deinterleave through vnsrl.wi if the element type is smaller than // ELEN if (VecVT.getScalarSizeInBits() < Subtarget.getELen()) { - SDValue Even = getDeinterleaveViaVNSRL(DL, VecVT, Concat, true, DAG); - SDValue Odd = getDeinterleaveViaVNSRL(DL, VecVT, Concat, false, DAG); + SDValue Even = getDeinterleaveShiftAndTrunc(DL, VecVT, Concat, 2, 0, DAG); + SDValue Odd = getDeinterleaveShiftAndTrunc(DL, VecVT, Concat, 2, 1, DAG); return DAG.getMergeValues({Even, Odd}, DL); } // For the indices, use the same SEW to avoid an extra vsetvli + // TODO: If container type is larger than m1, we can consider using a splat + // of a constant instead of the following sequence + + // Create a vector of even indices {0, 1, 2, ...} MVT IdxVT = ConcatVT.changeVectorElementTypeToInteger(); - // Create a vector of even indices {0, 2, 4, ...} - SDValue EvenIdx = - DAG.getStepVector(DL, IdxVT, APInt(IdxVT.getScalarSizeInBits(), 2)); - // Create a vector of odd indices {1, 3, 5, ... } - SDValue OddIdx = - DAG.getNode(ISD::ADD, DL, IdxVT, EvenIdx, DAG.getConstant(1, DL, IdxVT)); - - // Gather the even and odd elements into two separate vectors - SDValue EvenWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT, - Concat, EvenIdx, Passthru, Mask, VL); - SDValue OddWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ConcatVT, - Concat, OddIdx, Passthru, Mask, VL); + SDValue StepVec = DAG.getStepVector(DL, IdxVT); + // 0, 1, 0, 1, 0, 1 + SDValue ZeroOnes = + DAG.getNode(ISD::AND, DL, IdxVT, StepVec, DAG.getConstant(1, DL, IdxVT)); + MVT MaskVT = ConcatVT.changeVectorElementType(MVT::i1); + SDValue EvenMask = + DAG.getSetCC(DL, MaskVT, ZeroOnes, DAG.getConstant(0, DL, IdxVT), + ISD::CondCode::SETEQ); + // Have the latter be the not of the former to minimize the live range of + // the index vector since that might be large. 
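// A scalar model of the mask-and-compress path may help (purely illustrative):
// the even mask is (step & 1) == 0, the odd mask is its logical NOT, and
// "compress" packs the selected lanes to the front in order, matching the
// semantics of the ISD::VECTOR_COMPRESS nodes built below.

#include <cstdio>
#include <vector>

std::vector<int> compress(const std::vector<int> &V,
                          const std::vector<bool> &Mask) {
  std::vector<int> Out;
  for (size_t i = 0; i != V.size(); ++i)
    if (Mask[i])
      Out.push_back(V[i]); // keep selected lanes, packed and in order
  return Out;
}

int main() {
  std::vector<int> Concat = {10, 11, 20, 21, 30, 31}; // interleaved pairs
  std::vector<bool> EvenMask, OddMask;
  for (size_t i = 0; i != Concat.size(); ++i) {
    EvenMask.push_back((i & 1) == 0);
    OddMask.push_back((i & 1) != 0);
  }
  for (int X : compress(Concat, EvenMask))
    std::printf("%d ", X); // 10 20 30
  std::printf("\n");
  for (int X : compress(Concat, OddMask))
    std::printf("%d ", X); // 11 21 31
  std::printf("\n");
}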
+ SDValue OddMask = DAG.getLogicalNOT(DL, EvenMask, MaskVT); + + // vcompress the even and odd elements into two separate vectors + SDValue EvenWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat, + EvenMask, DAG.getUNDEF(ConcatVT)); + SDValue OddWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat, + OddMask, DAG.getUNDEF(ConcatVT)); // Extract the result half of the gather for even and odd SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide, @@ -18073,6 +18060,20 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SrcVT = N0.getValueType(); + if (VT.isRISCVVectorTuple() && N0->getOpcode() == ISD::SPLAT_VECTOR) { + unsigned NF = VT.getRISCVVectorTupleNumFields(); + unsigned NumScalElts = VT.getSizeInBits().getKnownMinValue() / (NF * 8); + SDValue EltVal = DAG.getConstant(0, DL, Subtarget.getXLenVT()); + MVT ScalTy = MVT::getScalableVectorVT(MVT::getIntegerVT(8), NumScalElts); + + SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, ScalTy, EltVal); + + SDValue Result = DAG.getUNDEF(VT); + for (unsigned i = 0; i < NF; ++i) + Result = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VT, Result, Splat, + DAG.getVectorIdxConstant(i, DL)); + return Result; + } // If this is a bitcast between a MVT::v4i1/v2i1/v1i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((SrcVT == MVT::v1i1 || SrcVT == MVT::v2i1 || SrcVT == MVT::v4i1) && @@ -21298,8 +21299,9 @@ bool RISCVTargetLowering::shouldExtendTypeInLibCall(EVT Type) const { return true; } -bool RISCVTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const { - if (Subtarget.is64Bit() && Type == MVT::i32) +bool RISCVTargetLowering::shouldSignExtendTypeInLibCall(Type *Ty, + bool IsSigned) const { + if (Subtarget.is64Bit() && Ty->isIntegerTy(32)) return true; return IsSigned; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index c753469562eba..bb0d9a71abf7e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -746,7 +746,7 @@ class RISCVTargetLowering : public TargetLowering { getExceptionSelectorRegister(const Constant *PersonalityFn) const override; bool shouldExtendTypeInLibCall(EVT Type) const override; - bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override; + bool shouldSignExtendTypeInLibCall(Type *Ty, bool IsSigned) const override; /// Returns the register with the specified architectural or ABI name. 
This
/// method is necessary to lower the llvm.read_register.* and
/// llvm.write_register.* intrinsics.
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 421150a370199..870e393b40411 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -889,7 +889,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
    AU.addUsedIfAvailable();
    AU.addPreserved();
    AU.addPreserved();
-    AU.addPreserved();
+    AU.addPreserved();
    AU.addPreserved();

    MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 8e0c4826ac00d..6506b6746b151 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -1726,7 +1726,7 @@ foreach n = [1, 2, 4, 8] in {
  def VMV#n#R_V : RVInstV<0b100111, !add(n, -1), OPIVI, (outs vrc:$vd),
                          (ins vrc:$vs2), "vmv" # n # "r.v", "$vd, $vs2">,
                  VMVRSched {
-    let Uses = [];
+    let Uses = [VTYPE];
    let vm = 1;
  }
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 9a5a48333904c..aed476db1956f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1013,20 +1013,65 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
-  case Intrinsic::usub_sat:
+  case Intrinsic::usub_sat: {
+    auto LT = getTypeLegalizationCost(RetTy);
+    if (ST->hasVInstructions() && LT.second.isVector()) {
+      unsigned Op;
+      switch (ICA.getID()) {
+      case Intrinsic::sadd_sat:
+        Op = RISCV::VSADD_VV;
+        break;
+      case Intrinsic::ssub_sat:
+        Op = RISCV::VSSUB_VV;
+        break;
+      case Intrinsic::uadd_sat:
+        Op = RISCV::VSADDU_VV;
+        break;
+      case Intrinsic::usub_sat:
+        Op = RISCV::VSSUBU_VV;
+        break;
+      }
+      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
+    }
+    break;
+  }
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
-    if (ST->hasVInstructions() && LT.second.isVector())
-      return LT.first;
+    // TODO: add f16/bf16, bf16 with zvfbfmin && f16 with zvfhmin
+    if (ST->hasVInstructions() && LT.second.isVector()) {
+      unsigned Op;
+      switch (ICA.getID()) {
+      case Intrinsic::fabs:
+        Op = RISCV::VFSGNJX_VV;
+        break;
+      case Intrinsic::sqrt:
+        Op = RISCV::VFSQRT_V;
+        break;
+      }
+      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
+    }
    break;
  }
  case Intrinsic::cttz:
  case Intrinsic::ctlz:
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
-    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
-      return LT.first;
+    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) {
+      unsigned Op;
+      switch (ICA.getID()) {
+      case Intrinsic::cttz:
+        Op = RISCV::VCTZ_V;
+        break;
+      case Intrinsic::ctlz:
+        Op = RISCV::VCLZ_V;
+        break;
+      case Intrinsic::ctpop:
+        Op = RISCV::VCPOP_V;
+        break;
+      }
+      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
+    }
    break;
  }
  case Intrinsic::abs: {
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 8210e20ce5b10..4012bd7696c45 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -45,6 +45,7 @@ using namespace llvm;
namespace {
class SPIRVAsmPrinter : public AsmPrinter {
  unsigned NLabels = 0;
+  SmallPtrSet<const MachineBasicBlock *, 8> LabeledMBB;

public:
  explicit
SPIRVAsmPrinter(TargetMachine &TM, @@ -152,13 +153,9 @@ void SPIRVAsmPrinter::outputOpFunctionEnd() { outputMCInst(FunctionEndInst); } -// Emit OpFunctionEnd at the end of MF and clear BBNumToRegMap. void SPIRVAsmPrinter::emitFunctionBodyEnd() { - // Do not emit anything if it's an internal service function. - if (isHidden()) - return; - outputOpFunctionEnd(); - MAI->BBNumToRegMap.clear(); + if (!isHidden()) + outputOpFunctionEnd(); } void SPIRVAsmPrinter::emitOpLabel(const MachineBasicBlock &MBB) { @@ -171,6 +168,7 @@ void SPIRVAsmPrinter::emitOpLabel(const MachineBasicBlock &MBB) { LabelInst.addOperand(MCOperand::createReg(MAI->getOrCreateMBBRegister(MBB))); outputMCInst(LabelInst); ++NLabels; + LabeledMBB.insert(&MBB); } void SPIRVAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { @@ -267,7 +265,7 @@ void SPIRVAsmPrinter::emitInstruction(const MachineInstr *MI) { // Output OpLabel after OpFunction and OpFunctionParameter in the first MBB. const MachineInstr *NextMI = MI->getNextNode(); - if (!MAI->hasMBBRegister(*MI->getParent()) && isFuncOrHeaderInstr(MI, TII) && + if (!LabeledMBB.contains(MI->getParent()) && isFuncOrHeaderInstr(MI, TII) && (!NextMI || !isFuncOrHeaderInstr(NextMI, TII))) { assert(MI->getParent()->getNumber() == MF->front().getNumber() && "OpFunction is not in the front MBB of MF"); diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index a1684b87722cb..45a49674d4ca2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -91,6 +91,7 @@ struct IntelSubgroupsBuiltin { uint32_t Opcode; bool IsBlock; bool IsWrite; + bool IsMedia; }; #define GET_IntelSubgroupsBuiltins_DECL @@ -215,6 +216,8 @@ std::string lookupBuiltinNameHelper(StringRef DemangledCall) { // - "__spirv_ReadClockKHR" // - "__spirv_SubgroupBlockReadINTEL" // - "__spirv_SubgroupImageBlockReadINTEL" + // - "__spirv_SubgroupImageMediaBlockReadINTEL" + // - "__spirv_SubgroupImageMediaBlockWriteINTEL" // - "__spirv_Convert" // - "__spirv_UConvert" // - "__spirv_SConvert" @@ -225,7 +228,9 @@ std::string lookupBuiltinNameHelper(StringRef DemangledCall) { static const std::regex SpvWithR( "(__spirv_(ImageSampleExplicitLod|ImageRead|ImageQuerySizeLod|UDotKHR|" "SDotKHR|SUDotKHR|SDotAccSatKHR|UDotAccSatKHR|SUDotAccSatKHR|" - "ReadClockKHR|SubgroupBlockReadINTEL|SubgroupImageBlockReadINTEL|Convert|" + "ReadClockKHR|SubgroupBlockReadINTEL|SubgroupImageBlockReadINTEL|" + "SubgroupImageMediaBlockReadINTEL|SubgroupImageMediaBlockWriteINTEL|" + "Convert|" "UConvert|SConvert|FConvert|SatConvert).*)_R.*"); std::smatch Match; if (std::regex_match(BuiltinName, Match, SpvWithR) && Match.size() > 2) @@ -1192,19 +1197,28 @@ static bool generateIntelSubgroupsInst(const SPIRV::IncomingCall *Call, const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; MachineFunction &MF = MIRBuilder.getMF(); const auto *ST = static_cast(&MF.getSubtarget()); - if (!ST->canUseExtension(SPIRV::Extension::SPV_INTEL_subgroups)) { + const SPIRV::IntelSubgroupsBuiltin *IntelSubgroups = + SPIRV::lookupIntelSubgroupsBuiltin(Builtin->Name); + + if (IntelSubgroups->IsMedia && + !ST->canUseExtension(SPIRV::Extension::SPV_INTEL_media_block_io)) { + std::string DiagMsg = std::string(Builtin->Name) + + ": the builtin requires the following SPIR-V " + "extension: SPV_INTEL_media_block_io"; + report_fatal_error(DiagMsg.c_str(), false); + } else if (!IntelSubgroups->IsMedia && + !ST->canUseExtension(SPIRV::Extension::SPV_INTEL_subgroups)) { std::string DiagMsg = 
std::string(Builtin->Name) + ": the builtin requires the following SPIR-V " "extension: SPV_INTEL_subgroups"; report_fatal_error(DiagMsg.c_str(), false); } - const SPIRV::IntelSubgroupsBuiltin *IntelSubgroups = - SPIRV::lookupIntelSubgroupsBuiltin(Builtin->Name); uint32_t OpCode = IntelSubgroups->Opcode; if (Call->isSpirvOp()) { bool IsSet = OpCode != SPIRV::OpSubgroupBlockWriteINTEL && - OpCode != SPIRV::OpSubgroupImageBlockWriteINTEL; + OpCode != SPIRV::OpSubgroupImageBlockWriteINTEL && + OpCode != SPIRV::OpSubgroupImageMediaBlockWriteINTEL; return buildOpFromWrapper(MIRBuilder, OpCode, Call, IsSet ? GR->getSPIRVTypeID(Call->ReturnType) : Register(0)); diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 43f4e78c8469c..dc2da4a3a5647 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -1156,14 +1156,19 @@ class IntelSubgroupsBuiltin { string Name = name; Op Opcode = operation; bit IsBlock = !or(!eq(operation, OpSubgroupBlockReadINTEL), - !eq(operation, OpSubgroupBlockWriteINTEL)); - bit IsWrite = !eq(operation, OpSubgroupBlockWriteINTEL); + !eq(operation, OpSubgroupBlockWriteINTEL), + !eq(operation, OpSubgroupImageMediaBlockReadINTEL), + !eq(operation, OpSubgroupImageMediaBlockWriteINTEL)); + bit IsWrite = !or(!eq(operation, OpSubgroupBlockWriteINTEL), + !eq(operation, OpSubgroupImageMediaBlockWriteINTEL)); + bit IsMedia = !or(!eq(operation, OpSubgroupImageMediaBlockReadINTEL), + !eq(operation, OpSubgroupImageMediaBlockWriteINTEL)); } // Table gathering all the Intel sub group builtins. def IntelSubgroupsBuiltins : GenericTable { let FilterClass = "IntelSubgroupsBuiltin"; - let Fields = ["Name", "Opcode", "IsBlock", "IsWrite"]; + let Fields = ["Name", "Opcode", "IsBlock", "IsWrite", "IsMedia"]; } // Function to lookup group builtins by their name and set. @@ -1191,17 +1196,24 @@ foreach i = ["", "2", "4", "8"] in { // cl_intel_subgroups_short defm : DemangledIntelSubgroupsBuiltin; defm : DemangledIntelSubgroupsBuiltin; + // cl_intel_media_block_io + defm : DemangledIntelSubgroupsBuiltin; + defm : DemangledIntelSubgroupsBuiltin; + defm : DemangledIntelSubgroupsBuiltin; + defm : DemangledIntelSubgroupsBuiltin; } -// cl_intel_subgroups_char, cl_intel_subgroups_short, cl_intel_subgroups_long +// cl_intel_subgroups_char, cl_intel_subgroups_short, cl_intel_subgroups_long, cl_intel_media_block_io foreach i = ["", "2", "4", "8", "16"] in { foreach j = ["c", "s", "l"] in { defm : DemangledIntelSubgroupsBuiltin; defm : DemangledIntelSubgroupsBuiltin; + defm : DemangledIntelSubgroupsBuiltin; + defm : DemangledIntelSubgroupsBuiltin; } } // OpSubgroupImageBlockReadINTEL and OpSubgroupImageBlockWriteINTEL are to be resolved later on (in code) -// Multiclass used to define builtin wrappers for the SPV_INTEL_subgroups extension. +// Multiclass used to define builtin wrappers for the SPV_INTEL_subgroups and the SPV_INTEL_media_block_io extensions. 
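+// Each wrapper instantiation expands to two records: a DemangledBuiltin that
+// routes the demangled name to the IntelSubgroups group, and an
+// IntelSubgroupsBuiltin carrying the opcode together with the
+// IsBlock/IsWrite/IsMedia flags derived from that opcode.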
multiclass DemangledIntelSubgroupsBuiltinWrapper numArgs, Op operation> { def : DemangledBuiltin; def : IntelSubgroupsBuiltin; @@ -1215,6 +1227,8 @@ defm : DemangledIntelSubgroupsBuiltinWrapper<"SubgroupBlockReadINTEL", 1, OpSubg defm : DemangledIntelSubgroupsBuiltinWrapper<"SubgroupBlockWriteINTEL", 2, OpSubgroupBlockWriteINTEL>; defm : DemangledIntelSubgroupsBuiltinWrapper<"SubgroupImageBlockReadINTEL", 2, OpSubgroupImageBlockReadINTEL>; defm : DemangledIntelSubgroupsBuiltinWrapper<"SubgroupImageBlockWriteINTEL", 3, OpSubgroupImageBlockWriteINTEL>; +defm : DemangledIntelSubgroupsBuiltinWrapper<"SubgroupImageMediaBlockReadINTEL", 4, OpSubgroupImageMediaBlockReadINTEL>; +defm : DemangledIntelSubgroupsBuiltinWrapper<"SubgroupImageMediaBlockWriteINTEL", 5, OpSubgroupImageMediaBlockWriteINTEL>; //===----------------------------------------------------------------------===// // Class defining a builtin for group operations within uniform control flow. diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 3fdaa6aa3257e..e8e853c5c758a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -65,7 +65,8 @@ bool SPIRVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, } // Based on the LLVM function attributes, get a SPIR-V FunctionControl. -static uint32_t getFunctionControl(const Function &F) { +static uint32_t getFunctionControl(const Function &F, + const SPIRVSubtarget *ST) { MemoryEffects MemEffects = F.getMemoryEffects(); uint32_t FuncControl = static_cast(SPIRV::FunctionControl::None); @@ -80,6 +81,11 @@ static uint32_t getFunctionControl(const Function &F) { else if (MemEffects.onlyReadsMemory()) FuncControl |= static_cast(SPIRV::FunctionControl::Const); + if (ST->canUseExtension(SPIRV::Extension::SPV_INTEL_optnone) || + ST->canUseExtension(SPIRV::Extension::SPV_EXT_optnone)) + if (F.hasFnAttribute(Attribute::OptimizeNone)) + FuncControl |= static_cast(SPIRV::FunctionControl::OptNoneEXT); + return FuncControl; } @@ -346,6 +352,12 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, buildOpDecorate(VRegs[i][0], MIRBuilder, SPIRV::Decoration::FuncParamAttr, {Attr}); } + if (Arg.hasAttribute(Attribute::StructRet)) { + auto Attr = + static_cast(SPIRV::FunctionParameterAttribute::Sret); + buildOpDecorate(VRegs[i][0], MIRBuilder, + SPIRV::Decoration::FuncParamAttr, {Attr}); + } if (F.getCallingConv() == CallingConv::SPIR_KERNEL) { std::vector ArgTypeQualDecs = @@ -397,7 +409,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, FTy = fixFunctionTypeIfPtrArgs(GR, F, FTy, RetTy, ArgTypeVRegs); SPIRVType *FuncTy = GR->getOrCreateOpTypeFunctionWithArgs( FTy, RetTy, ArgTypeVRegs, MIRBuilder); - uint32_t FuncControl = getFunctionControl(F); + uint32_t FuncControl = getFunctionControl(F, ST); // Add OpFunction instruction MachineInstrBuilder MB = MIRBuilder.buildInstr(SPIRV::OpFunction) @@ -427,10 +439,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, // Handle entry points and function linkage. 
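  // Entry points are announced via OpEntryPoint with the execution model
  // computed from the subtarget and the function, while other externally
  // visible functions get an Import/Export linkage decoration instead.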
if (isEntryPoint(F)) { - const auto &STI = MIRBuilder.getMF().getSubtarget(); - auto executionModel = getExecutionModel(STI, F); auto MIB = MIRBuilder.buildInstr(SPIRV::OpEntryPoint) - .addImm(static_cast(executionModel)) + .addImm(static_cast(getExecutionModel(*ST, F))) .addUse(FuncVReg); addStringImm(F.getName(), MIB); } else if (F.getLinkage() != GlobalValue::InternalLinkage && diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 186bccc481a8a..e78fc5ce18707 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -42,12 +42,15 @@ static const std::map> {"SPV_INTEL_global_variable_host_access", SPIRV::Extension::Extension::SPV_INTEL_global_variable_host_access}, {"SPV_INTEL_optnone", SPIRV::Extension::Extension::SPV_INTEL_optnone}, + {"SPV_EXT_optnone", SPIRV::Extension::Extension::SPV_EXT_optnone}, {"SPV_INTEL_usm_storage_classes", SPIRV::Extension::Extension::SPV_INTEL_usm_storage_classes}, {"SPV_INTEL_split_barrier", SPIRV::Extension::Extension::SPV_INTEL_split_barrier}, {"SPV_INTEL_subgroups", SPIRV::Extension::Extension::SPV_INTEL_subgroups}, + {"SPV_INTEL_media_block_io", + SPIRV::Extension::Extension::SPV_INTEL_media_block_io}, {"SPV_KHR_uniform_group_instructions", SPIRV::Extension::Extension::SPV_KHR_uniform_group_instructions}, {"SPV_KHR_no_integer_wrap_decoration", diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index e6f136cc81b4b..f45bdfc7aacb7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -199,6 +199,8 @@ class SPIRVEmitIntrinsics DenseMap Ptrcasts); void replaceAllUsesWith(Value *Src, Value *Dest, bool DeleteOld = true); + void replaceAllUsesWithAndErase(IRBuilder<> &B, Instruction *Src, + Instruction *Dest, bool DeleteOld = true); bool runOnFunction(Function &F); bool postprocessTypes(Module &M); @@ -322,6 +324,17 @@ static inline void reportFatalOnTokenType(const Instruction *I) { false); } +static void emitAssignName(Instruction *I, IRBuilder<> &B) { + if (!I->hasName() || I->getType()->isAggregateType() || + expectIgnoredInIRTranslation(I)) + return; + reportFatalOnTokenType(I); + setInsertPointAfterDef(B, I); + std::vector Args = {I}; + addStringImm(I->getName(), B, Args); + B.CreateIntrinsic(Intrinsic::spv_assign_name, {I->getType()}, Args); +} + void SPIRVEmitIntrinsics::replaceAllUsesWith(Value *Src, Value *Dest, bool DeleteOld) { Src->replaceAllUsesWith(Dest); @@ -336,6 +349,19 @@ void SPIRVEmitIntrinsics::replaceAllUsesWith(Value *Src, Value *Dest, } } +void SPIRVEmitIntrinsics::replaceAllUsesWithAndErase(IRBuilder<> &B, + Instruction *Src, + Instruction *Dest, + bool DeleteOld) { + replaceAllUsesWith(Src, Dest, DeleteOld); + std::string Name = Src->hasName() ? 
Src->getName().str() : ""; + Src->eraseFromParent(); + if (!Name.empty()) { + Dest->setName(Name); + emitAssignName(Dest, B); + } +} + static bool IsKernelArgInt8(Function *F, StoreInst *SI) { return SI && F->getCallingConv() == CallingConv::SPIR_KERNEL && isPointerTy(SI->getValueOperand()->getType()) && @@ -475,7 +501,7 @@ void SPIRVEmitIntrinsics::propagateElemType( DenseMap Ptrcasts; SmallVector Users(Op->users()); for (auto *U : Users) { - if (!isa(U) || isa(U) || isSpvIntrinsic(U)) + if (!isa(U) || isSpvIntrinsic(U)) continue; if (!VisitedSubst.insert(std::make_pair(U, Op)).second) continue; @@ -506,7 +532,7 @@ void SPIRVEmitIntrinsics::propagateElemTypeRec( return; SmallVector Users(Op->users()); for (auto *U : Users) { - if (!isa(U) || isa(U) || isSpvIntrinsic(U)) + if (!isa(U) || isSpvIntrinsic(U)) continue; if (!VisitedSubst.insert(std::make_pair(U, Op)).second) continue; @@ -958,6 +984,14 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( return; Uncomplete = isTodoType(I); Ops.push_back(std::make_pair(Ref->getPointerOperand(), 0)); + } else if (auto *Ref = dyn_cast(I)) { + if (!isPointerTy(I->getType())) + return; + KnownElemTy = GR->findDeducedElementType(I); + if (!KnownElemTy) + return; + Uncomplete = isTodoType(I); + Ops.push_back(std::make_pair(Ref->getOperand(0), 0)); } else if (auto *Ref = dyn_cast(I)) { if (GR->findDeducedElementType(Ref->getPointerOperand())) return; @@ -1030,7 +1064,6 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( } } } - TypeValidated.insert(I); // Non-recursive update of types in the function uncomplete returns. // This may happen just once per a function, the latch is a pair of // findDeducedElementType(F) / addDeducedElementType(F, ...). @@ -1043,6 +1076,7 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( } else if (UncompleteRets) { UncompleteRets->insert(I); } + TypeValidated.insert(I); return; } Uncomplete = isTodoType(CurrF); @@ -1300,8 +1334,7 @@ Instruction *SPIRVEmitIntrinsics::visitGetElementPtrInst(GetElementPtrInst &I) { for (auto &Op : I.operands()) Args.push_back(Op); auto *NewI = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args}); - replaceAllUsesWith(&I, NewI); - I.eraseFromParent(); + replaceAllUsesWithAndErase(B, &I, NewI); return NewI; } @@ -1323,10 +1356,7 @@ Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) { SmallVector Types = {I.getType(), Source->getType()}; SmallVector Args(I.op_begin(), I.op_end()); auto *NewI = B.CreateIntrinsic(Intrinsic::spv_bitcast, {Types}, {Args}); - std::string InstName = I.hasName() ? I.getName().str() : ""; - replaceAllUsesWith(&I, NewI); - I.eraseFromParent(); - NewI->setName(InstName); + replaceAllUsesWithAndErase(B, &I, NewI); return NewI; } @@ -1369,10 +1399,6 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast( Instruction *I, Value *Pointer, Type *ExpectedElementType, unsigned OperandToReplace, IRBuilder<> &B) { TypeValidated.insert(I); - // If Pointer is the result of nop BitCastInst (ptr -> ptr), use the source - // pointer instead. The BitCastInst should be later removed when visited. 
- while (BitCastInst *BC = dyn_cast(Pointer)) - Pointer = BC->getOperand(0); // Do not emit spv_ptrcast if Pointer's element type is ExpectedElementType Type *PointerElemTy = deduceElementTypeHelper(Pointer, false); @@ -1585,10 +1611,7 @@ Instruction *SPIRVEmitIntrinsics::visitInsertElementInst(InsertElementInst &I) { B.SetInsertPoint(&I); SmallVector Args(I.op_begin(), I.op_end()); auto *NewI = B.CreateIntrinsic(Intrinsic::spv_insertelt, {Types}, {Args}); - std::string InstName = I.hasName() ? I.getName().str() : ""; - replaceAllUsesWith(&I, NewI); - I.eraseFromParent(); - NewI->setName(InstName); + replaceAllUsesWithAndErase(B, &I, NewI); return NewI; } @@ -1600,10 +1623,7 @@ SPIRVEmitIntrinsics::visitExtractElementInst(ExtractElementInst &I) { I.getIndexOperand()->getType()}; SmallVector Args = {I.getVectorOperand(), I.getIndexOperand()}; auto *NewI = B.CreateIntrinsic(Intrinsic::spv_extractelt, {Types}, {Args}); - std::string InstName = I.hasName() ? I.getName().str() : ""; - replaceAllUsesWith(&I, NewI); - I.eraseFromParent(); - NewI->setName(InstName); + replaceAllUsesWithAndErase(B, &I, NewI); return NewI; } @@ -1637,8 +1657,7 @@ Instruction *SPIRVEmitIntrinsics::visitExtractValueInst(ExtractValueInst &I) { Args.push_back(B.getInt32(Op)); auto *NewI = B.CreateIntrinsic(Intrinsic::spv_extractv, {I.getType()}, {Args}); - replaceAllUsesWith(&I, NewI); - I.eraseFromParent(); + replaceAllUsesWithAndErase(B, &I, NewI); return NewI; } @@ -1697,10 +1716,7 @@ Instruction *SPIRVEmitIntrinsics::visitAllocaInst(AllocaInst &I) { ArraySize ? B.CreateIntrinsic(Intrinsic::spv_alloca_array, {PtrTy, ArraySize->getType()}, {ArraySize}) : B.CreateIntrinsic(Intrinsic::spv_alloca, {PtrTy}, {}); - std::string InstName = I.hasName() ? I.getName().str() : ""; - replaceAllUsesWith(&I, NewI); - I.eraseFromParent(); - NewI->setName(InstName); + replaceAllUsesWithAndErase(B, &I, NewI); return NewI; } @@ -1759,8 +1775,7 @@ bool SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B, bool UnknownElemTypeI8) { reportFatalOnTokenType(I); - if (!isPointerTy(I->getType()) || !requireAssignType(I) || - isa(I)) + if (!isPointerTy(I->getType()) || !requireAssignType(I)) return false; setInsertPointAfterDef(B, I); @@ -1861,8 +1876,9 @@ void SPIRVEmitIntrinsics::insertSpirvDecorations(Instruction *I, void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I, IRBuilder<> &B) { auto *II = dyn_cast(I); - if (II && II->getIntrinsicID() == Intrinsic::spv_const_composite && - TrackConstants) { + bool IsConstComposite = + II && II->getIntrinsicID() == Intrinsic::spv_const_composite; + if (IsConstComposite && TrackConstants) { setInsertPointAfterDef(B, I); auto t = AggrConsts.find(I); assert(t != AggrConsts.end()); @@ -1886,23 +1902,31 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I, : B.SetInsertPoint(I); BPrepared = true; } + Type *OpTy = Op->getType(); Value *OpTyVal = Op; - if (Op->getType()->isTargetExtTy()) - OpTyVal = PoisonValue::get(Op->getType()); - auto *NewOp = buildIntrWithMD(Intrinsic::spv_track_constant, - {Op->getType(), OpTyVal->getType()}, Op, - OpTyVal, {}, B); + if (OpTy->isTargetExtTy()) + OpTyVal = PoisonValue::get(OpTy); + CallInst *NewOp = + buildIntrWithMD(Intrinsic::spv_track_constant, + {OpTy, OpTyVal->getType()}, Op, OpTyVal, {}, B); + Type *OpElemTy = nullptr; + if (!IsConstComposite && isPointerTy(OpTy) && + (OpElemTy = GR->findDeducedElementType(Op)) != nullptr && + OpElemTy != IntegerType::getInt8Ty(I->getContext())) { + buildAssignPtr(B, 
IntegerType::getInt8Ty(I->getContext()), NewOp); + SmallVector Types = {OpTy, OpTy}; + SmallVector Args = { + NewOp, buildMD(PoisonValue::get(OpElemTy)), + B.getInt32(getPointerAddressSpace(OpTy))}; + CallInst *PtrCasted = + B.CreateIntrinsic(Intrinsic::spv_ptrcast, {Types}, Args); + buildAssignPtr(B, OpElemTy, PtrCasted); + NewOp = PtrCasted; + } I->setOperand(OpNo, NewOp); } } - if (I->hasName() && !I->getType()->isAggregateType() && - !expectIgnoredInIRTranslation(I)) { - reportFatalOnTokenType(I); - setInsertPointAfterDef(B, I); - std::vector Args = {I}; - addStringImm(I->getName(), B, Args); - B.CreateIntrinsic(Intrinsic::spv_assign_name, {I->getType()}, Args); - } + emitAssignName(I, B); } Type *SPIRVEmitIntrinsics::deduceFunParamElementType(Function *F, @@ -2022,8 +2046,16 @@ void SPIRVEmitIntrinsics::processParamTypes(Function *F, IRBuilder<> &B) { if (!isUntypedPointerTy(Arg->getType())) continue; Type *ElemTy = GR->findDeducedElementType(Arg); - if (!ElemTy && (ElemTy = deduceFunParamElementType(F, OpIdx)) != nullptr) - buildAssignPtr(B, ElemTy, Arg); + if (!ElemTy && (ElemTy = deduceFunParamElementType(F, OpIdx)) != nullptr) { + if (CallInst *AssignCI = GR->findAssignPtrTypeInstr(Arg)) { + DenseSet> VisitedSubst; + updateAssignType(AssignCI, Arg, PoisonValue::get(ElemTy)); + propagateElemType(Arg, IntegerType::getInt8Ty(F->getContext()), + VisitedSubst); + } else { + buildAssignPtr(B, ElemTy, Arg); + } + } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp index d3e323efaee91..b98cef0a4f07f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp @@ -268,7 +268,7 @@ bool SPIRVEmitNonSemanticDI::emitGlobalDI(MachineFunction &MF) { // We aren't extracting any DebugInfoFlags now so we // emitting zero to use as Flags argument for DebugBasicType const Register I32ZeroReg = - GR->buildConstantInt(0, MIRBuilder, I32Ty, false); + GR->buildConstantInt(0, MIRBuilder, I32Ty, false, false); // We need to store pairs because further instructions reference // the DIBasicTypes and size will be always small so there isn't diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 9ac659f6b4f11..91b9cbcf15128 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -325,8 +325,8 @@ Register SPIRVGlobalRegistry::getOrCreateConstInt(uint64_t Val, MachineInstr &I, Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder, - SPIRVType *SpvType, - bool EmitIR) { + SPIRVType *SpvType, bool EmitIR, + bool ZeroAsNull) { assert(SpvType); auto &MF = MIRBuilder.getMF(); const IntegerType *LLVMIntTy = @@ -348,7 +348,7 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val, } else { Register SpvTypeReg = getSPIRVTypeID(SpvType); MachineInstrBuilder MIB; - if (Val) { + if (Val || !ZeroAsNull) { MIB = MIRBuilder.buildInstr(SPIRV::OpConstantI) .addDef(Res) .addUse(SpvTypeReg); diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index ff4b0ea8757fa..df92325ed1980 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -509,7 +509,8 @@ class SPIRVGlobalRegistry { public: Register buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder, - SPIRVType *SpvType, bool EmitIR = true); + SPIRVType *SpvType, bool EmitIR = true, + bool 
ZeroAsNull = true); Register getOrCreateConstInt(uint64_t Val, MachineInstr &I, SPIRVType *SpvType, const SPIRVInstrInfo &TII, bool ZeroAsNull = true); diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index fde23d9d0ca5f..53f1b644a9498 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -850,6 +850,12 @@ def OpSubgroupImageBlockReadINTEL: Op<5577, (outs ID:$res), (ins TYPE:$type, ID: def OpSubgroupImageBlockWriteINTEL: Op<5578, (outs), (ins ID:$image, ID:$coordinate, ID:$data), "OpSubgroupImageBlockWriteINTEL $image $coordinate $data">; +// SPV_INTEL_media_block_io +def OpSubgroupImageMediaBlockReadINTEL: Op<5580, (outs ID:$res), (ins TYPE:$type, ID:$image, ID:$coordinate, ID:$width, ID:$height), + "$res = OpSubgroupImageMediaBlockReadINTEL $type $image $coordinate $width $height">; +def OpSubgroupImageMediaBlockWriteINTEL: Op<5581, (outs), (ins ID:$image, ID:$coordinate, ID:$width, ID:$height, ID:$data), + "OpSubgroupImageMediaBlockWriteINTEL $image $coordinate $width $height $data">; + // - SPV_KHR_uniform_group_instructions def OpGroupIMulKHR: Op<6401, (outs ID:$res), (ins TYPE:$type, ID:$scope, i32imm:$groupOp, ID:$value), "$res = OpGroupIMulKHR $type $scope $groupOp $value">; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index d0335117cbe12..3547ac66430a8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -1460,6 +1460,16 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg, .addUse(SrcPtr) .constrainAllUses(TII, TRI, RBI); + if ((SrcSC == SPIRV::StorageClass::Function && + DstSC == SPIRV::StorageClass::Private) || + (DstSC == SPIRV::StorageClass::Function && + SrcSC == SPIRV::StorageClass::Private)) { + return BuildMI(BB, I, DL, TII.get(TargetOpcode::COPY)) + .addDef(ResVReg) + .addUse(SrcPtr) + .constrainAllUses(TII, TRI, RBI); + } + // Casting from an eligible pointer to Generic. if (DstSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(SrcSC)) return selectUnOp(ResVReg, ResType, I, SPIRV::OpPtrCastToGeneric); @@ -3461,11 +3471,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( if (HasInit && !Init) return true; - unsigned AddrSpace = GV->getAddressSpace(); - SPIRV::StorageClass::StorageClass Storage = - addressSpaceToStorageClass(AddrSpace, STI); - bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage && - Storage != SPIRV::StorageClass::Function; + bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage; SPIRV::LinkageType::LinkageType LnkType = (GV->isDeclaration() || GV->hasAvailableExternallyLinkage()) ? SPIRV::LinkageType::Import @@ -3474,12 +3480,14 @@ bool SPIRVInstructionSelector::selectGlobalValue( ? 
SPIRV::LinkageType::LinkOnceODR : SPIRV::LinkageType::Export); - SPIRVType *ResType = GR.getOrCreateSPIRVPointerType( - PointerBaseType, I, TII, - addressSpaceToStorageClass(GV->getAddressSpace(), STI)); - Register Reg = GR.buildGlobalVariable(ResVReg, ResType, GlobalIdent, GV, - Storage, Init, GlobalVar->isConstant(), - HasLnkTy, LnkType, MIRBuilder, true); + const unsigned AddrSpace = GV->getAddressSpace(); + SPIRV::StorageClass::StorageClass StorageClass = + addressSpaceToStorageClass(AddrSpace, STI); + SPIRVType *ResType = + GR.getOrCreateSPIRVPointerType(PointerBaseType, I, TII, StorageClass); + Register Reg = GR.buildGlobalVariable( + ResVReg, ResType, GlobalIdent, GV, StorageClass, Init, + GlobalVar->isConstant(), HasLnkTy, LnkType, MIRBuilder, true); return Reg.isValid(); } diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 90898b8bd7250..7230e0e6b9fca 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -112,13 +112,16 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { const LLT p5 = LLT::pointer(5, PSize); // Input, SPV_INTEL_usm_storage_classes (Device) const LLT p6 = LLT::pointer(6, PSize); // SPV_INTEL_usm_storage_classes (Host) + const LLT p7 = LLT::pointer(7, PSize); // Input + const LLT p8 = LLT::pointer(8, PSize); // Output + const LLT p10 = LLT::pointer(10, PSize); // Private // TODO: remove copy-pasting here by using concatenation in some way. auto allPtrsScalarsAndVectors = { - p0, p1, p2, p3, p4, p5, p6, s1, s8, s16, - s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, v3s16, - v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, v8s8, v8s16, - v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; + p0, p1, p2, p3, p4, p5, p6, p7, p8, p10, + s1, s8, s16, s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, + v3s1, v3s8, v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, + v8s1, v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; auto allVectors = {v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, @@ -145,10 +148,10 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { s16, s32, s64, v2s16, v2s32, v2s64, v3s16, v3s32, v3s64, v4s16, v4s32, v4s64, v8s16, v8s32, v8s64, v16s16, v16s32, v16s64}; - auto allFloatAndIntScalarsAndPtrs = {s8, s16, s32, s64, p0, p1, - p2, p3, p4, p5, p6}; + auto allFloatAndIntScalarsAndPtrs = {s8, s16, s32, s64, p0, p1, p2, + p3, p4, p5, p6, p7, p8, p10}; - auto allPtrs = {p0, p1, p2, p3, p4, p5, p6}; + auto allPtrs = {p0, p1, p2, p3, p4, p5, p6, p7, p8, p10}; bool IsExtendedInts = ST.canUseExtension( diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index d9f928eb90640..2054081476315 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -421,6 +421,7 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { continue; MachineFunction *MF = MMI->getMachineFunction(*F); assert(MF); + for (MachineBasicBlock &MBB : *MF) for (MachineInstr &MI : MBB) { if (MAI.getSkipEmission(&MI)) @@ -1320,6 +1321,13 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::SubgroupImageBlockIOINTEL); } break; + case SPIRV::OpSubgroupImageMediaBlockReadINTEL: + case SPIRV::OpSubgroupImageMediaBlockWriteINTEL: + if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_media_block_io)) { + 
Reqs.addExtension(SPIRV::Extension::SPV_INTEL_media_block_io); + Reqs.addCapability(SPIRV::Capability::SubgroupImageMediaBlockIOINTEL); + } + break; case SPIRV::OpAssumeTrueKHR: case SPIRV::OpExpectKHR: if (ST.canUseExtension(SPIRV::Extension::SPV_KHR_expect_assume)) { @@ -1541,11 +1549,14 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, SPIRV::OperandCategory::ExecutionModeOperand, SPIRV::ExecutionMode::VecTypeHint, ST); - if (F.hasOptNone() && - ST.canUseExtension(SPIRV::Extension::SPV_INTEL_optnone)) { - // Output OpCapability OptNoneINTEL. - MAI.Reqs.addExtension(SPIRV::Extension::SPV_INTEL_optnone); - MAI.Reqs.addCapability(SPIRV::Capability::OptNoneINTEL); + if (F.hasOptNone()) { + if (ST.canUseExtension(SPIRV::Extension::SPV_EXT_optnone)) { + MAI.Reqs.addExtension(SPIRV::Extension::SPV_EXT_optnone); + MAI.Reqs.addCapability(SPIRV::Capability::OptNoneEXT); + } else if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_optnone)) { + MAI.Reqs.addExtension(SPIRV::Extension::SPV_INTEL_optnone); + MAI.Reqs.addCapability(SPIRV::Capability::OptNoneINTEL); + } } } } @@ -1606,6 +1617,27 @@ static void addDecorations(const Module &M, const SPIRVInstrInfo &TII, } } +static void addMBBNames(const Module &M, const SPIRVInstrInfo &TII, + MachineModuleInfo *MMI, const SPIRVSubtarget &ST, + SPIRV::ModuleAnalysisInfo &MAI) { + for (auto F = M.begin(), E = M.end(); F != E; ++F) { + MachineFunction *MF = MMI->getMachineFunction(*F); + if (!MF) + continue; + MachineRegisterInfo &MRI = MF->getRegInfo(); + for (auto &MBB : *MF) { + if (!MBB.hasName() || MBB.empty()) + continue; + // Emit basic block names. + Register Reg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + MRI.setRegClass(Reg, &SPIRV::IDRegClass); + buildOpName(Reg, MBB.getName(), *std::prev(MBB.end()), TII); + Register GlobalReg = MAI.getOrCreateMBBRegister(MBB); + MAI.setRegisterAlias(MF, Reg, GlobalReg); + } + } +} + struct SPIRV::ModuleAnalysisInfo SPIRVModuleAnalysis::MAI; void SPIRVModuleAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { @@ -1624,6 +1656,7 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) { setBaseInfo(M); + addMBBNames(M, *TII, MMI, *ST, MAI); addDecorations(M, *TII, MMI, *ST, MAI); collectReqs(M, MAI, MMI, *ST); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index 024728c347e8a..ee2aaf156aa89 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -157,7 +157,7 @@ struct ModuleAnalysisInfo { // The array contains lists of MIs for each module section. InstrList MS[NUM_MODULE_SECTIONS]; // The table maps MBB number to SPIR-V unique ID register. - DenseMap BBNumToRegMap; + DenseMap, Register> BBNumToRegMap; Register getFuncReg(const Function *F) { assert(F && "Function is null"); @@ -188,15 +188,17 @@ struct ModuleAnalysisInfo { } unsigned getNextID() { return MaxID++; } bool hasMBBRegister(const MachineBasicBlock &MBB) { - return BBNumToRegMap.contains(MBB.getNumber()); + auto Key = std::make_pair(MBB.getParent(), MBB.getNumber()); + return BBNumToRegMap.contains(Key); } // Convert MBB's number to corresponding ID register. 
Register getOrCreateMBBRegister(const MachineBasicBlock &MBB) { - auto f = BBNumToRegMap.find(MBB.getNumber()); - if (f != BBNumToRegMap.end()) - return f->second; + auto Key = std::make_pair(MBB.getParent(), MBB.getNumber()); + auto It = BBNumToRegMap.find(Key); + if (It != BBNumToRegMap.end()) + return It->second; Register NewReg = Register::index2VirtReg(getNextID()); - BBNumToRegMap[MBB.getNumber()] = NewReg; + BBNumToRegMap[Key] = NewReg; return NewReg; } }; diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index b88f6f5766a05..a3a88acdd6c6a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -304,6 +304,7 @@ defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109>; defm SPV_INTEL_global_variable_fpga_decorations : ExtensionOperand<110>; defm SPV_KHR_cooperative_matrix : ExtensionOperand<111>; defm SPV_EXT_arithmetic_fence : ExtensionOperand<112>; +defm SPV_EXT_optnone : ExtensionOperand<113>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -446,7 +447,7 @@ defm RayTracingNV : CapabilityOperand<5340, 0, 0, [], [Shader]>; defm SubgroupShuffleINTEL : CapabilityOperand<5568, 0, 0, [SPV_INTEL_subgroups], []>; defm SubgroupBufferBlockIOINTEL : CapabilityOperand<5569, 0, 0, [SPV_INTEL_subgroups], []>; defm SubgroupImageBlockIOINTEL : CapabilityOperand<5570, 0, 0, [SPV_INTEL_subgroups], []>; -defm SubgroupImageMediaBlockIOINTEL : CapabilityOperand<5579, 0, 0, [], []>; +defm SubgroupImageMediaBlockIOINTEL : CapabilityOperand<5579, 0, 0, [SPV_INTEL_media_block_io], []>; defm SubgroupAvcMotionEstimationINTEL : CapabilityOperand<5696, 0, 0, [], []>; defm SubgroupAvcMotionEstimationIntraINTEL : CapabilityOperand<5697, 0, 0, [], []>; defm SubgroupAvcMotionEstimationChromaINTEL : CapabilityOperand<5698, 0, 0, [], []>; @@ -463,6 +464,7 @@ defm PhysicalStorageBufferAddressesEXT : CapabilityOperand<5347, 0, 0, [], [Shad defm CooperativeMatrixNV : CapabilityOperand<5357, 0, 0, [], [Shader]>; defm ArbitraryPrecisionIntegersINTEL : CapabilityOperand<5844, 0, 0, [SPV_INTEL_arbitrary_precision_integers], [Int8, Int16]>; defm OptNoneINTEL : CapabilityOperand<6094, 0, 0, [SPV_INTEL_optnone], []>; +defm OptNoneEXT : CapabilityOperand<6094, 0, 0, [SPV_EXT_optnone], []>; defm BitInstructions : CapabilityOperand<6025, 0, 0, [SPV_KHR_bit_instructions], []>; defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []>; defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>; @@ -1433,6 +1435,7 @@ defm Inline : FunctionControlOperand<0x1>; defm DontInline : FunctionControlOperand<0x2>; defm Pure : FunctionControlOperand<0x4>; defm Const : FunctionControlOperand<0x8>; +defm OptNoneEXT : FunctionControlOperand<0x10000>; //===----------------------------------------------------------------------===// // Multiclass used to define MemorySemantics enum values and at the same time diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 1ece3044aaa7b..7a1914aac8ceb 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -108,6 +108,16 @@ void buildOpName(Register Target, const StringRef &Name, } } +void buildOpName(Register Target, const StringRef &Name, MachineInstr &I, + const SPIRVInstrInfo &TII) { + if (!Name.empty()) { + auto MIB = + 
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpName)) + .addUse(Target); + addStringImm(Name, MIB); + } +} + static void finishBuildOpDecorate(MachineInstrBuilder &MIB, const std::vector &DecArgs, StringRef StrImm) { @@ -207,8 +217,12 @@ addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI) { : SPIRV::StorageClass::CrossWorkgroup; case 7: return SPIRV::StorageClass::Input; + case 8: + return SPIRV::StorageClass::Output; case 9: return SPIRV::StorageClass::CodeSectionINTEL; + case 10: + return SPIRV::StorageClass::Private; default: report_fatal_error("Unknown address space"); } diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index c0569549039d5..cc77e0afa275a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -129,6 +129,8 @@ void addNumImm(const APInt &Imm, MachineInstrBuilder &MIB); // Add an OpName instruction for the given target register. void buildOpName(Register Target, const StringRef &Name, MachineIRBuilder &MIRBuilder); +void buildOpName(Register Target, const StringRef &Name, MachineInstr &I, + const SPIRVInstrInfo &TII); // Add an OpDecorate instruction for the given Reg. void buildOpDecorate(Register Reg, MachineIRBuilder &MIRBuilder, @@ -170,8 +172,12 @@ storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC) { return 6; case SPIRV::StorageClass::Input: return 7; + case SPIRV::StorageClass::Output: + return 8; case SPIRV::StorageClass::CodeSectionINTEL: return 9; + case SPIRV::StorageClass::Private: + return 10; default: report_fatal_error("Unable to get address space id"); } diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index e4aefc42d860f..599afed2199fb 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -1148,9 +1148,10 @@ ParseStatus SystemZAsmParser::parseAddress(OperandVector &Operands, if (HaveReg1) { if (parseAddressRegister(Reg1)) return ParseStatus::Failure; - // If the are two registers, the first one is the index and the - // second is the base. - if (HaveReg2) + // If there are two registers, the first one is the index and the + // second is the base. If there is only a single register, it is + // used as base with GAS and as index with HLASM. + if (HaveReg2 || isParsingHLASM()) Index = Reg1.Num == 0 ? 0 : Regs[Reg1.Num]; else Base = Reg1.Num == 0 ? 
0 : Regs[Reg1.Num]; diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 8f505b7e198cf..975a0f5050d16 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -2147,8 +2147,8 @@ std::pair SystemZTargetLowering::makeExternalCall( for (SDValue Op : Ops) { Entry.Node = Op; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), IsSigned); - Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), IsSigned); + Entry.IsSExt = shouldSignExtendTypeInLibCall(Entry.Ty, IsSigned); + Entry.IsZExt = !Entry.IsSExt; Args.push_back(Entry); } @@ -2157,7 +2157,7 @@ std::pair SystemZTargetLowering::makeExternalCall( Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); - bool SignExtend = shouldSignExtendTypeInLibCall(RetVT, IsSigned); + bool SignExtend = shouldSignExtendTypeInLibCall(RetTy, IsSigned); CLI.setDebugLoc(DL) .setChain(Chain) .setCallee(CallConv, RetTy, Callee, std::move(Args)) diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td index 6e136b10aed42..ae8f669e9bab4 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2328,6 +2328,12 @@ class AsmCondBranchRI opcode> : InstRIc; +class NeverCondBranchRI opcode> + : InstRIc { + let M1 = 0; +} + class FixedCondBranchRI opcode, SDPatternOperator operator = null_frag> : InstRIc opcode> : InstRILc; +class NeverCondBranchRIL opcode> + : InstRILc { + let M1 = 0; +} + class FixedCondBranchRIL opcode> : InstRILc { @@ -2365,10 +2377,16 @@ class AsmCondBranchRR opcode> : InstRR; -class NeverCondBranchRR opcode> - : InstRR { - let R1 = 0; +multiclass NeverCondBranchRR opcode> { + // For the no-op (always false) branch, the target is optional. + def "" : InstRR { + let R1 = 0; + } + def Opt : InstRR { + let R1 = 0; + let R2 = 0; + } } class FixedCondBranchRR opcode, @@ -2392,11 +2410,19 @@ class AsmCondBranchRX opcode> (ins imm32zx4:$M1, (bdxaddr12only $B2, $D2, $X2):$XBD2), mnemonic#"\t$M1, $XBD2", []>; -class NeverCondBranchRX opcode> - : InstRXb { - let M1 = 0; +multiclass NeverCondBranchRX opcode> { + // For the no-op (always false) branch, the target is optional. + def "" : InstRXb { + let M1 = 0; + } + def Opt : InstRXb { + let M1 = 0; + let B2 = 0; + let D2 = 0; + let X2 = 0; + } } class FixedCondBranchRX opcode> diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index f3baf896658de..5cbba0d9c5edd 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -109,20 +109,11 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in { } // NOPs. These are again variants of the conditional branches, with the -// condition mask set to "never". NOP_bare can't be an InstAlias since it -// would need R0D hard coded which is not part of ADDR64BitRegClass. 
-def NOP : NeverCondBranchRX<"nop", 0x47>; -let isAsmParserOnly = 1, hasNoSchedulingInfo = 1, M1 = 0, X2 = 0, B2 = 0, D2 = 0 in - def NOP_bare : InstRXb<0x47,(outs), (ins), "nop", []>; -def NOPR : NeverCondBranchRR<"nopr", 0x07>; -def NOPR_bare : InstAlias<"nopr", (NOPR R0D), 0>; - -// An alias of BRC 0, label -def JNOP : InstAlias<"jnop\t$RI2", (BRCAsm 0, brtarget16:$RI2), 0>; - -// An alias of BRCL 0, label -// jgnop on gnu ; jlnop on hlasm -def JGNOP : InstAlias<"{jgnop|jlnop}\t$RI2", (BRCLAsm 0, brtarget32:$RI2), 0>; +// condition mask set to "never". +defm NOP : NeverCondBranchRX<"nop", 0x47>; +defm NOPR : NeverCondBranchRR<"nopr", 0x07>; +def JNOP : NeverCondBranchRI<"jnop", 0xA74>; +def JGNOP : NeverCondBranchRIL<"j{g|l}nop", 0xC04>; // Fused compare-and-branch instructions. // diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td index d0fec02777875..094b481b81f83 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -1557,7 +1557,7 @@ def : InstRW<[WLat30, MCD], (instregex "SAL$")>; // NOPs //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?$")>; - +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?(Opt)?$")>; +def : InstRW<[WLat1, VBU, NormalGr], (instregex "J(G)?NOP$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td index a6d89ce9443c5..b9376d422ded2 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td @@ -1647,7 +1647,7 @@ def : InstRW<[WLat30, MCD], (instregex "SAL$")>; // NOPs //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?$")>; - +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?(Opt)?$")>; +def : InstRW<[WLat1, VBU, NormalGr], (instregex "J(G)?NOP$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td index 455354e283ad8..5e28bf935a24b 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td @@ -1694,6 +1694,7 @@ def : InstRW<[WLat30, MCD], (instregex "SAL$")>; // NOPs //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?(Opt)?$")>; +def : InstRW<[WLat1, VBU, NormalGr], (instregex "J(G)?NOP$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td index 92abf0ba4022c..2c01691707cc3 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td @@ -1727,6 +1727,7 @@ def : InstRW<[WLat30, MCD], (instregex "SAL$")>; // NOPs //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?$")>; +def : InstRW<[WLat1, FXb, NormalGr], (instregex "NOP(R)?(Opt)?$")>; +def : InstRW<[WLat1, VBU, NormalGr], (instregex "J(G)?NOP$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td index 99d0d674bbbb2..f41a7057bb1f6 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -1239,6 +1239,7 @@ def : InstRW<[WLat30, MCD], 
(instregex "SAL$")>; // NOPs //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, LSU, EndGroup], (instregex "NOP(R)?$")>; +def : InstRW<[WLat1, LSU, EndGroup], (instregex "NOP(R)?(Opt)?$")>; +def : InstRW<[WLat1, LSU, EndGroup], (instregex "J(G)?NOP$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td index 5b334da2bac34..8f0a10d2863a3 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -1284,6 +1284,7 @@ def : InstRW<[WLat30, MCD], (instregex "SAL$")>; // NOPs //===----------------------------------------------------------------------===// -def : InstRW<[WLat1, LSU, NormalGr], (instregex "NOP(R)?$")>; +def : InstRW<[WLat1, LSU, NormalGr], (instregex "NOP(R)?(Opt)?$")>; +def : InstRW<[WLat1, VBU, NormalGr], (instregex "J(G)?NOP$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 83b42f6d1794d..772efcdf8f9fc 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/InstructionCost.h" #include "llvm/Support/MathExtras.h" using namespace llvm; @@ -1396,30 +1397,86 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost( return NumVectorMemOps + NumPermutes; } +InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits) { + InstructionCost Cost = 0; + // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total. + Cost += NumVec - 1; + // For integer adds, VSUM creates shorter reductions on the final vector. + Cost += (ScalarBits < 32) ? 3 : 2; + return Cost; +} + +InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems, + unsigned ScalarBits) { + unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits); + InstructionCost Cost = 0; + // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total. + Cost += NumVec - 1; + // For each shuffle / arithmetic layer, we need 2 instructions, and we need + // log2(Elements in Last Vector) layers. + Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg)); + return Cost; +} + +inline bool customCostReductions(unsigned Opcode) { + return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::Add || Opcode == Instruction::Mul; +} + +InstructionCost +SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + std::optional FMF, + TTI::TargetCostKind CostKind) { + unsigned ScalarBits = Ty->getScalarSizeInBits(); + // The following is only for subtargets with vector math, non-ordered + // reductions, and reasonable scalar sizes for int and fp add/mul. + if (customCostReductions(Opcode) && ST->hasVector() && + !TTI::requiresOrderedReduction(FMF) && + ScalarBits <= SystemZ::VectorBits) { + unsigned NumVectors = getNumVectorRegs(Ty); + unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements(); + // Integer Add is using custom code gen, that needs to be accounted for. 
+ if (Opcode == Instruction::Add) + return getIntAddReductionCost(NumVectors, ScalarBits); + // The base cost is the same across all other arithmetic instructions + InstructionCost Cost = + getFastReductionCost(NumVectors, NumElems, ScalarBits); + // But we need to account for the final op involving the scalar operand. + if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul)) + Cost += 1; + return Cost; + } + // otherwise, fall back to the standard implementation + return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); +} + +InstructionCost +SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, + FastMathFlags FMF, + TTI::TargetCostKind CostKind) { + // Return custom costs only on subtargets with vector enhancements. + if (ST->hasVectorEnhancements1()) { + unsigned NumVectors = getNumVectorRegs(Ty); + unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements(); + unsigned ScalarBits = Ty->getScalarSizeInBits(); + InstructionCost Cost = 0; + // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total. + Cost += NumVectors - 1; + // For the final vector, we need shuffle + min/max operations, and + // we need #Elements - 1 of them. + Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1); + return Cost; + } + // For other targets, fall back to the standard implementation + return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); +} + static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, const SmallVectorImpl &ParamTys) { if (RetTy->isVectorTy() && ID == Intrinsic::bswap) return getNumVectorRegs(RetTy); // VPERM - if (ID == Intrinsic::vector_reduce_add) { - // Retrieve number and size of elements for the vector op. - auto *VTy = cast(ParamTys.front()); - unsigned ScalarSize = VTy->getScalarSizeInBits(); - // For scalar sizes >128 bits, we fall back to the generic cost estimate. - if (ScalarSize > SystemZ::VectorBits) - return -1; - // This many vector regs are needed to represent the input elements (V). - unsigned VectorRegsNeeded = getNumVectorRegs(VTy); - // This many instructions are needed for the final sum of vector elems (S). - unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2; - // We use vector adds to create a sum vector, which takes - // V/2 + V/4 + ... = V - 1 operations. - // Then, we need S operations to sum up the elements of that sum vector, - // for a total of V + S - 1 operations. 
- int Cost = VectorRegsNeeded + LastVectorHandling - 1; - return Cost; - } return -1; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 6795da59bf5b1..512fcc854d532 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -130,6 +130,13 @@ class SystemZTTIImpl : public BasicTTIImplBase { Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond = false, bool UseMaskForGaps = false); + InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + std::optional FMF, + TTI::TargetCostKind CostKind); + InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, + FastMathFlags FMF, + TTI::TargetCostKind CostKind); + InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 10451600050ca..f693ef3dbf962 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -276,7 +276,18 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { : MCTargetAsmParser(Options, STI, MII), Parser(Parser), Lexer(Parser.getLexer()), Is64(STI.getTargetTriple().isArch64Bit()), TC(Parser, MII, Is64), SkipTypeCheck(Options.MCNoTypeCheck) { - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + FeatureBitset FBS = ComputeAvailableFeatures(STI.getFeatureBits()); + + // bulk-memory implies bulk-memory-opt + if (FBS.test(WebAssembly::FeatureBulkMemory)) { + FBS.set(WebAssembly::FeatureBulkMemoryOpt); + } + // reference-types implies call-indirect-overlong + if (FBS.test(WebAssembly::FeatureReferenceTypes)) { + FBS.set(WebAssembly::FeatureCallIndirectOverlong); + } + + setAvailableFeatures(FBS); // Don't type check if this is inline asm, since that is a naked sequence of // instructions without a function/locals decl. auto &SM = Parser.getSourceManager(); @@ -291,7 +302,8 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { DefaultFunctionTable = getOrCreateFunctionTableSymbol( getContext(), "__indirect_function_table", Is64); - if (!STI->checkFeatures("+reference-types")) + if (!STI->checkFeatures("+call-indirect-overlong") && + !STI->checkFeatures("+reference-types")) DefaultFunctionTable->setOmitFromLinkingSection(); } @@ -531,11 +543,13 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { } bool parseFunctionTableOperand(std::unique_ptr *Op) { - if (STI->checkFeatures("+reference-types")) { - // If the reference-types feature is enabled, there is an explicit table - // operand. To allow the same assembly to be compiled with or without - // reference types, we allow the operand to be omitted, in which case we - // default to __indirect_function_table. + if (STI->checkFeatures("+call-indirect-overlong") || + STI->checkFeatures("+reference-types")) { + // If the call-indirect-overlong feature is enabled, or implied by the + // reference-types feature, there is an explicit table operand. To allow + // the same assembly to be compiled with or without + // call-indirect-overlong, we allow the operand to be omitted, in which + // case we default to __indirect_function_table. 
auto &Tok = Lexer.getTok(); if (Tok.is(AsmToken::Identifier)) { auto *Sym = diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td index 88628f2a79354..13603f8181198 100644 --- a/llvm/lib/Target/WebAssembly/WebAssembly.td +++ b/llvm/lib/Target/WebAssembly/WebAssembly.td @@ -29,6 +29,14 @@ def FeatureBulkMemory : SubtargetFeature<"bulk-memory", "HasBulkMemory", "true", "Enable bulk memory operations">; +def FeatureBulkMemoryOpt : + SubtargetFeature<"bulk-memory-opt", "HasBulkMemoryOpt", "true", + "Enable bulk memory optimization operations">; + +def FeatureCallIndirectOverlong : + SubtargetFeature<"call-indirect-overlong", "HasCallIndirectOverlong", "true", + "Enable overlong encoding for call_indirect immediates">; + def FeatureExceptionHandling : SubtargetFeature<"exception-handling", "HasExceptionHandling", "true", "Enable Wasm exception handling">; @@ -114,15 +122,23 @@ def : ProcessorModel<"mvp", NoSchedModel, []>; // consideration given to available support in relevant engines and tools, and // the importance of the features. def : ProcessorModel<"generic", NoSchedModel, - [FeatureBulkMemory, FeatureMultivalue, + [FeatureBulkMemory, FeatureBulkMemoryOpt, + FeatureCallIndirectOverlong, FeatureMultivalue, FeatureMutableGlobals, FeatureNontrappingFPToInt, FeatureReferenceTypes, FeatureSignExt]>; +// Lime1: +def : ProcessorModel<"lime1", NoSchedModel, + [FeatureBulkMemoryOpt, FeatureCallIndirectOverlong, + FeatureExtendedConst, FeatureMultivalue, + FeatureMutableGlobals, FeatureNontrappingFPToInt, + FeatureSignExt]>; + // Latest and greatest experimental version of WebAssembly. Bugs included! def : ProcessorModel<"bleeding-edge", NoSchedModel, - [FeatureAtomics, FeatureBulkMemory, - FeatureExceptionHandling, FeatureExtendedConst, - FeatureFP16, FeatureMultiMemory, + [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt, + FeatureCallIndirectOverlong, FeatureExceptionHandling, + FeatureExtendedConst, FeatureFP16, FeatureMultiMemory, FeatureMultivalue, FeatureMutableGlobals, FeatureNontrappingFPToInt, FeatureRelaxedSIMD, FeatureReferenceTypes, FeatureSIMD128, FeatureSignExt, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 558aaa38096f7..210a35e1462ac 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -895,7 +895,7 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { // The table into which this call_indirect indexes. 
MCSymbolWasm *Table = WebAssembly::getOrCreateFunctionTableSymbol( MF->getContext(), Subtarget); - if (Subtarget->hasReferenceTypes()) { + if (Subtarget->hasCallIndirectOverlong()) { MIB.addSym(Table); } else { // Otherwise for the MVP there is at most one table whose number is 0, but diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 94b49387b58f9..c765d2b1ab95b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -768,7 +768,7 @@ LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB, MF.getContext(), Subtarget) : WebAssembly::getOrCreateFunctionTableSymbol( MF.getContext(), Subtarget); - if (Subtarget->hasReferenceTypes()) { + if (Subtarget->hasCallIndirectOverlong()) { MIB.addSym(Table); } else { // For the MVP there is at most one table whose number is 0, but we can't diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td index 0772afb039f82..79d6f21517e5d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td @@ -11,13 +11,13 @@ /// //===----------------------------------------------------------------------===// -// Instruction requiring HasBulkMemory and the bulk memory prefix byte +// Instruction requiring HasBulkMemoryOpt and the bulk memory prefix byte multiclass BULK_I pattern_r, string asmstr_r = "", string asmstr_s = "", bits<32> simdop = -1> { defm "" : I, - Requires<[HasBulkMemory]>; + Requires<[HasBulkMemoryOpt]>; } // Bespoke types and nodes for bulk memory ops @@ -89,14 +89,14 @@ defm CPY_A#B : I<(outs), (ins i32imm_op:$src_idx, i32imm_op:$dst_idx, rc:$dst, rc:$src, rc:$len )], "", "", 0>, - Requires<[HasBulkMemory]>; + Requires<[HasBulkMemoryOpt]>; let usesCustomInserter = 1, isCodeGenOnly = 1, mayStore = 1 in defm SET_A#B : I<(outs), (ins i32imm_op:$idx, rc:$dst, I32:$value, rc:$size), (outs), (ins i32imm_op:$idx), [(wasm_memset (i32 imm:$idx), rc:$dst, I32:$value, rc:$size)], "", "", 0>, - Requires<[HasBulkMemory]>; + Requires<[HasBulkMemoryOpt]>; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index b3ea499c4f915..415e802951a94 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -30,6 +30,14 @@ def HasBulkMemory : Predicate<"Subtarget->hasBulkMemory()">, AssemblerPredicate<(all_of FeatureBulkMemory), "bulk-memory">; +def HasBulkMemoryOpt : + Predicate<"Subtarget->hasBulkMemoryOpt()">, + AssemblerPredicate<(all_of FeatureBulkMemoryOpt), "bulk-memory-opt">; + +def HasCallIndirectOverlong : + Predicate<"Subtarget->hasCallIndirectOverlong()">, + AssemblerPredicate<(all_of FeatureCallIndirectOverlong), "call-indirect-overlong">; + def HasExceptionHandling : Predicate<"Subtarget->hasExceptionHandling()">, AssemblerPredicate<(all_of FeatureExceptionHandling), "exception-handling">; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp index d51bfeb6d8592..6f37dab409534 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp @@ -23,7 +23,7 @@ SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy( SDValue Size, Align Alignment, 
bool IsVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { auto &ST = DAG.getMachineFunction().getSubtarget(); - if (!ST.hasBulkMemory()) + if (!ST.hasBulkMemoryOpt()) return SDValue(); SDValue MemIdx = DAG.getConstant(0, DL, MVT::i32); @@ -51,7 +51,7 @@ SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemset( SDValue Size, Align Alignment, bool IsVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo) const { auto &ST = DAG.getMachineFunction().getSubtarget(); - if (!ST.hasBulkMemory()) + if (!ST.hasBulkMemoryOpt()) return SDValue(); SDValue MemIdx = DAG.getConstant(0, DL, MVT::i32); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp index 912f61765579f..40ea48ab3ac48 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp @@ -34,6 +34,24 @@ WebAssemblySubtarget::initializeSubtargetDependencies(StringRef CPU, CPU = "generic"; ParseSubtargetFeatures(CPU, /*TuneCPU*/ CPU, FS); + + FeatureBitset Bits = getFeatureBits(); + + // bulk-memory implies bulk-memory-opt + if (HasBulkMemory) { + HasBulkMemoryOpt = true; + Bits.set(WebAssembly::FeatureBulkMemoryOpt); + } + + // reference-types implies call-indirect-overlong + if (HasReferenceTypes) { + HasCallIndirectOverlong = true; + Bits.set(WebAssembly::FeatureCallIndirectOverlong); + } + + // In case we changed any bits, update `MCSubtargetInfo`'s `FeatureBitset`. + setFeatureBits(Bits); + return *this; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h index f2bf2902f775b..591ce25611e3e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -41,6 +41,8 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo { bool HasAtomics = false; bool HasBulkMemory = false; + bool HasBulkMemoryOpt = false; + bool HasCallIndirectOverlong = false; bool HasExceptionHandling = false; bool HasExtendedConst = false; bool HasFP16 = false; @@ -95,6 +97,8 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo { bool hasAddr64() const { return TargetTriple.isArch64Bit(); } bool hasAtomics() const { return HasAtomics; } bool hasBulkMemory() const { return HasBulkMemory; } + bool hasBulkMemoryOpt() const { return HasBulkMemoryOpt; } + bool hasCallIndirectOverlong() const { return HasCallIndirectOverlong; } bool hasExceptionHandling() const { return HasExceptionHandling; } bool hasExtendedConst() const { return HasExtendedConst; } bool hasFP16() const { return HasFP16; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index f950a915db96f..6cfc93ef1faae 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -116,7 +116,7 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol( Sym->setUndefined(); } // MVP object files can't have symtab entries for tables. - if (!(Subtarget && Subtarget->hasReferenceTypes())) + if (!(Subtarget && Subtarget->hasCallIndirectOverlong())) Sym->setOmitFromLinkingSection(); return Sym; } @@ -141,7 +141,7 @@ MCSymbolWasm *WebAssembly::getOrCreateFuncrefCallTableSymbol( Sym->setTableType(TableType); } // MVP object files can't have symtab entries for tables. 
- if (!(Subtarget && Subtarget->hasReferenceTypes())) + if (!(Subtarget && Subtarget->hasCallIndirectOverlong())) Sym->setOmitFromLinkingSection(); return Sym; } diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 4457e481def10..cd5a678d161b7 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -815,7 +815,7 @@ static StringRef getIntelProcessorTypeAndSubtype(unsigned Family, // Gracemont case 0xbe: - CPU = "gracement"; + CPU = "gracemont"; *Type = X86::INTEL_COREI7; *Subtype = X86::INTEL_COREI7_ALDERLAKE; break; diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 9cca3cdc76145..fde43bb354e83 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -430,10 +430,18 @@ class GlobalsImporter final { // than as part of the logic deciding which functions to import (i.e. // based on profile information). Should we decide to handle them here, // we can refactor accordingly at that time. - if (!GVS || !Index.canImportGlobalVar(GVS, /* AnalyzeRefs */ true) || + bool CanImportDecl = false; + if (!GVS || shouldSkipLocalInAnotherModule(GVS, VI.getSummaryList().size(), - Summary.modulePath())) + Summary.modulePath()) || + !Index.canImportGlobalVar(GVS, /* AnalyzeRefs */ true, + CanImportDecl)) { + if (ImportDeclaration && CanImportDecl) + ImportList.maybeAddDeclaration(RefSummary->modulePath(), + VI.getGUID()); + continue; + } // If there isn't an entry for GUID, insert pair. // Otherwise, definition should take precedence over declaration. diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 3381b5f77683b..16a80e9ebbeaa 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -946,11 +946,10 @@ OptimizeGlobalAddressOfAllocation(GlobalVariable *GV, CallInst *CI, // If there is a comparison against null, we will insert a global bool to // keep track of whether the global was initialized yet or not. - GlobalVariable *InitBool = - new GlobalVariable(Type::getInt1Ty(GV->getContext()), false, - GlobalValue::InternalLinkage, - ConstantInt::getFalse(GV->getContext()), - GV->getName()+".init", GV->getThreadLocalMode()); + GlobalVariable *InitBool = new GlobalVariable( + Type::getInt1Ty(GV->getContext()), false, GlobalValue::InternalLinkage, + ConstantInt::getFalse(GV->getContext()), GV->getName() + ".init", + GV->getThreadLocalMode(), GV->getAddressSpace()); bool InitBoolUsed = false; // Loop over all instruction uses of GV, processing them in turn. diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 87d2432803062..e3caefe70311b 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -2090,20 +2090,19 @@ bool LowerTypeTestsModule::lower() { }; MapVector ExportedFunctions; if (ExportSummary) { - // A set of all functions that are address taken by a live global object. 
- DenseSet AddressTaken; - for (auto &I : *ExportSummary) - for (auto &GVS : I.second.SummaryList) - if (GVS->isLive()) - for (const auto &Ref : GVS->refs()) { - AddressTaken.insert(Ref.getGUID()); - for (auto &RefGVS : Ref.getSummaryList()) - if (auto Alias = dyn_cast(RefGVS.get())) - AddressTaken.insert(Alias->getAliaseeGUID()); - } - NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions"); if (CfiFunctionsMD) { + // A set of all functions that are address taken by a live global object. + DenseSet AddressTaken; + for (auto &I : *ExportSummary) + for (auto &GVS : I.second.SummaryList) + if (GVS->isLive()) + for (const auto &Ref : GVS->refs()) { + AddressTaken.insert(Ref.getGUID()); + for (auto &RefGVS : Ref.getSummaryList()) + if (auto Alias = dyn_cast(RefGVS.get())) + AddressTaken.insert(Alias->getAliaseeGUID()); + } for (auto *FuncMD : CfiFunctionsMD->operands()) { assert(FuncMD->getNumOperands() >= 2); StringRef FunctionName = diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index fed21db393ed2..b486c5b0b6fad 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -631,7 +631,7 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base, GEPNoWrapFlags NW, /// We can look through PHIs, GEPs and casts in order to determine a common base /// between GEPLHS and RHS. static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS, - ICmpInst::Predicate Cond, + CmpPredicate Cond, const DataLayout &DL, InstCombiner &IC) { // FIXME: Support vector of pointers. @@ -675,8 +675,7 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS, /// Fold comparisons between a GEP instruction and something else. At this point /// we know that the GEP is on the LHS of the comparison. Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, - ICmpInst::Predicate Cond, - Instruction &I) { + CmpPredicate Cond, Instruction &I) { // Don't transform signed compares of GEPs into index compares. Even if the // GEP is inbounds, the final add of the base pointer can have signed overflow // and would change the result of the icmp. @@ -690,12 +689,32 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, if (!isa(RHS)) RHS = RHS->stripPointerCasts(); + auto CanFold = [Cond](GEPNoWrapFlags NW) { + if (ICmpInst::isEquality(Cond)) + return true; + + // Unsigned predicates can be folded if the GEPs have *any* nowrap flags. + assert(ICmpInst::isUnsigned(Cond)); + return NW != GEPNoWrapFlags::none(); + }; + + auto NewICmp = [Cond](GEPNoWrapFlags NW, Value *Op1, Value *Op2) { + if (!NW.hasNoUnsignedWrap()) { + // Convert signed to unsigned comparison. + return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Op1, Op2); + } + + auto *I = new ICmpInst(Cond, Op1, Op2); + I->setSameSign(NW.hasNoUnsignedSignedWrap()); + return I; + }; + Value *PtrBase = GEPLHS->getOperand(0); - if (PtrBase == RHS && (GEPLHS->isInBounds() || ICmpInst::isEquality(Cond))) { + if (PtrBase == RHS && CanFold(GEPLHS->getNoWrapFlags())) { // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0). 
Value *Offset = EmitGEPOffset(GEPLHS); - return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset, - Constant::getNullValue(Offset->getType())); + return NewICmp(GEPLHS->getNoWrapFlags(), Offset, + Constant::getNullValue(Offset->getType())); } if (GEPLHS->isInBounds() && ICmpInst::isEquality(Cond) && @@ -785,7 +804,7 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, return transformToIndexedCompare(GEPLHS, RHS, Cond, DL, *this); } - bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds(); + GEPNoWrapFlags NW = GEPLHS->getNoWrapFlags() & GEPRHS->getNoWrapFlags(); if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands() && GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType()) { // If the GEPs only differ by one index, compare it. @@ -813,19 +832,18 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, return replaceInstUsesWith(I, // No comparison is needed here. ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond))); - else if (NumDifferences == 1 && GEPsInBounds) { + else if (NumDifferences == 1 && CanFold(NW)) { Value *LHSV = GEPLHS->getOperand(DiffOperand); Value *RHSV = GEPRHS->getOperand(DiffOperand); - // Make sure we do a signed comparison here. - return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV); + return NewICmp(NW, LHSV, RHSV); } } - if (GEPsInBounds || CmpInst::isEquality(Cond)) { + if (CanFold(NW)) { // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2) Value *L = EmitGEPOffset(GEPLHS, /*RewriteGEP=*/true); Value *R = EmitGEPOffset(GEPRHS, /*RewriteGEP=*/true); - return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R); + return NewICmp(NW, L, R); } } @@ -912,7 +930,7 @@ bool InstCombinerImpl::foldAllocaCmp(AllocaInst *Alloca) { /// Fold "icmp pred (X+C), X". Instruction *InstCombinerImpl::foldICmpAddOpConst(Value *X, const APInt &C, - ICmpInst::Predicate Pred) { + CmpPredicate Pred) { // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0, // so the values can never be equal. Similarly for all other "or equals" // operators. @@ -1760,6 +1778,17 @@ Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp, if (!match(And, m_And(m_Value(X), m_APInt(C2)))) return nullptr; + // (and X, highmask) s> [0, ~highmask] --> X s> ~highmask + if (Cmp.getPredicate() == ICmpInst::ICMP_SGT && C1.ule(~*C2) && + C2->isNegatedPowerOf2()) + return new ICmpInst(ICmpInst::ICMP_SGT, X, + ConstantInt::get(X->getType(), ~*C2)); + // (and X, highmask) s< [1, -highmask] --> X s< -highmask + if (Cmp.getPredicate() == ICmpInst::ICMP_SLT && !C1.isSignMask() && + (C1 - 1).ule(~*C2) && C2->isNegatedPowerOf2() && !C2->isSignMask()) + return new ICmpInst(ICmpInst::ICMP_SLT, X, + ConstantInt::get(X->getType(), -*C2)); + // Don't perform the following transforms if the AND has multiple uses if (!And->hasOneUse()) return nullptr; @@ -3949,8 +3978,8 @@ Instruction *InstCombinerImpl::foldICmpBinOpWithConstant(ICmpInst &Cmp, } static Instruction * -foldICmpUSubSatOrUAddSatWithConstant(ICmpInst::Predicate Pred, - SaturatingInst *II, const APInt &C, +foldICmpUSubSatOrUAddSatWithConstant(CmpPredicate Pred, SaturatingInst *II, + const APInt &C, InstCombiner::BuilderTy &Builder) { // This transform may end up producing more than one instruction for the // intrinsic, so limit it to one user of the intrinsic. 
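The two (and X, highmask) folds added to foldICmpAndConstConst in the hunk above are easy to sanity-check outside the compiler: for a negated-power-of-two mask C2 and any C1 in [0, ~C2], "(X & C2) s> C1" collapses to "X s> ~C2". A minimal standalone check at i8 width (illustrative only; the width and the C2 = -8 mask are arbitrary choices, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const int8_t C2 = -8;     // 0b11111000, a negated power of two ("highmask")
  const int8_t NotC2 = ~C2; // 7
  for (int C1 = 0; C1 <= NotC2; ++C1)   // C1 in [0, ~C2]
    for (int X = -128; X <= 127; ++X) { // exhaustive over i8
      bool Masked = static_cast<int8_t>(X & C2) > static_cast<int8_t>(C1);
      bool Folded = static_cast<int8_t>(X) > NotC2;
      assert(Masked == Folded); // masking the low bits never changes the s>
    }
  return 0;
}

The s< variant follows the same argument with the constant range shifted to [1, -highmask].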
@@ -4034,7 +4063,7 @@ foldICmpUSubSatOrUAddSatWithConstant(ICmpInst::Predicate Pred, } static Instruction * -foldICmpOfCmpIntrinsicWithConstant(ICmpInst::Predicate Pred, IntrinsicInst *I, +foldICmpOfCmpIntrinsicWithConstant(CmpPredicate Pred, IntrinsicInst *I, const APInt &C, InstCombiner::BuilderTy &Builder) { std::optional NewPredicate = std::nullopt; @@ -4233,9 +4262,8 @@ Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) { return nullptr; } -Instruction *InstCombinerImpl::foldSelectICmp(ICmpInst::Predicate Pred, - SelectInst *SI, Value *RHS, - const ICmpInst &I) { +Instruction *InstCombinerImpl::foldSelectICmp(CmpPredicate Pred, SelectInst *SI, + Value *RHS, const ICmpInst &I) { // Try to fold the comparison into the select arms, which will cause the // select to be converted into a logical and/or. auto SimplifyOp = [&](Value *Op, bool SelectCondIsTrue) -> Value * { @@ -4404,7 +4432,7 @@ static bool isMaskOrZero(const Value *V, bool Not, const SimplifyQuery &Q, /// The Mask can be a constant, too. /// For some predicates, the operands are commutative. /// For others, x can only be on a specific side. -static Value *foldICmpWithLowBitMaskedVal(ICmpInst::Predicate Pred, Value *Op0, +static Value *foldICmpWithLowBitMaskedVal(CmpPredicate Pred, Value *Op0, Value *Op1, const SimplifyQuery &Q, InstCombiner &IC) { @@ -5515,8 +5543,7 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, /// Fold icmp Pred min|max(X, Y), Z. Instruction *InstCombinerImpl::foldICmpWithMinMax(Instruction &I, MinMaxIntrinsic *MinMax, - Value *Z, - ICmpInst::Predicate Pred) { + Value *Z, CmpPredicate Pred) { Value *X = MinMax->getLHS(); Value *Y = MinMax->getRHS(); if (ICmpInst::isSigned(Pred) && !MinMax->isSigned()) @@ -6869,8 +6896,8 @@ Instruction *InstCombinerImpl::foldICmpUsingBoolRange(ICmpInst &I) { return nullptr; } -std::optional> -InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, +std::optional> +InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred, Constant *C) { assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) && "Only for relational integer predicates."); @@ -7276,7 +7303,7 @@ static Instruction *foldReductionIdiom(ICmpInst &I, } // This helper will be called with icmp operands in both orders. -Instruction *InstCombinerImpl::foldICmpCommutative(ICmpInst::Predicate Pred, +Instruction *InstCombinerImpl::foldICmpCommutative(CmpPredicate Pred, Value *Op0, Value *Op1, ICmpInst &CxtI) { // Try to optimize 'icmp GEP, P' or 'icmp P, GEP'. 
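Most of the churn in the hunks above and below is mechanical: helper signatures move from ICmpInst::Predicate to CmpPredicate. The point of the wrapper is that it can carry the icmp's samesign flag through a fold instead of dropping it (setSameSign() in the NewICmp lambda above, I.getCmpPredicate() in visitICmpInst below), while still decaying to a plain predicate at untouched call sites. A rough sketch of the idea (illustrative only; the real class in llvm/IR/CmpPredicate.h differs in detail):

#include <cassert>

enum Predicate { ICMP_EQ, ICMP_ULT, ICMP_SGT /* ... */ };

class CmpPredicateSketch {
  Predicate Pred;
  bool HasSameSign; // an icmp's `samesign` flag, carried along with Pred

public:
  CmpPredicateSketch(Predicate P, bool SameSign = false)
      : Pred(P), HasSameSign(SameSign) {}
  // Implicit decay keeps pre-existing predicate-based code compiling.
  operator Predicate() const { return Pred; }
  bool hasSameSign() const { return HasSameSign; }
};

// A fold that used to take a raw Predicate keeps its call sites unchanged,
// but can now consult the samesign bit before rewriting.
bool canUseSignedFold(CmpPredicateSketch Pred) {
  return Pred == ICMP_ULT && Pred.hasSameSign();
}

int main() {
  assert(canUseSignedFold({ICMP_ULT, /*SameSign=*/true}));
  assert(!canUseSignedFold({ICMP_ULT})); // flag defaults to absent
  return 0;
}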
@@ -7404,7 +7431,7 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { Changed = true; } - if (Value *V = simplifyICmpInst(I.getPredicate(), Op0, Op1, Q)) + if (Value *V = simplifyICmpInst(I.getCmpPredicate(), Op0, Op1, Q)) return replaceInstUsesWith(I, V); // Comparing -val or val with non-zero is the same as just comparing val @@ -7511,10 +7538,10 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpInstWithConstantNotInt(I)) return Res; - if (Instruction *Res = foldICmpCommutative(I.getPredicate(), Op0, Op1, I)) + if (Instruction *Res = foldICmpCommutative(I.getCmpPredicate(), Op0, Op1, I)) return Res; if (Instruction *Res = - foldICmpCommutative(I.getSwappedPredicate(), Op1, Op0, I)) + foldICmpCommutative(I.getSwappedCmpPredicate(), Op1, Op0, I)) return Res; if (I.isCommutative()) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 0508ed48fc19c..28474fec8238e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -652,10 +652,10 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final /// folded operation. void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN); - Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, - ICmpInst::Predicate Cond, Instruction &I); - Instruction *foldSelectICmp(ICmpInst::Predicate Pred, SelectInst *SI, - Value *RHS, const ICmpInst &I); + Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, CmpPredicate Cond, + Instruction &I); + Instruction *foldSelectICmp(CmpPredicate Pred, SelectInst *SI, Value *RHS, + const ICmpInst &I); bool foldAllocaCmp(AllocaInst *Alloca); Instruction *foldCmpLoadFromIndexedGlobal(LoadInst *LI, GetElementPtrInst *GEP, @@ -663,8 +663,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final ConstantInt *AndCst = nullptr); Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI, Constant *RHSC); - Instruction *foldICmpAddOpConst(Value *X, const APInt &C, - ICmpInst::Predicate Pred); + Instruction *foldICmpAddOpConst(Value *X, const APInt &C, CmpPredicate Pred); Instruction *foldICmpWithCastOp(ICmpInst &ICmp); Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp); @@ -678,7 +677,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final const APInt &C); Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ); Instruction *foldICmpWithMinMax(Instruction &I, MinMaxIntrinsic *MinMax, - Value *Z, ICmpInst::Predicate Pred); + Value *Z, CmpPredicate Pred); Instruction *foldICmpEquality(ICmpInst &Cmp); Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I); Instruction *foldSignBitTest(ICmpInst &I); @@ -736,8 +735,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final const APInt &C); Instruction *foldICmpBitCast(ICmpInst &Cmp); Instruction *foldICmpWithTrunc(ICmpInst &Cmp); - Instruction *foldICmpCommutative(ICmpInst::Predicate Pred, Value *Op0, - Value *Op1, ICmpInst &CxtI); + Instruction *foldICmpCommutative(CmpPredicate Pred, Value *Op0, Value *Op1, + ICmpInst &CxtI); // Helpers of visitSelectInst(). 
   Instruction *foldSelectOfBools(SelectInst &SI);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index dde35fe3f69dd..c7a0c35d099cc 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3897,17 +3897,27 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
   if (SIFPOp) {
     // TODO: Try to forward-propagate FMF from select arms to the select.
 
+    auto *FCmp = dyn_cast<FCmpInst>(CondVal);
+
     // Canonicalize select of FP values where NaN and -0.0 are not valid as
     // minnum/maxnum intrinsics.
     if (SIFPOp->hasNoNaNs() && SIFPOp->hasNoSignedZeros()) {
       Value *X, *Y;
-      if (match(&SI, m_OrdOrUnordFMax(m_Value(X), m_Value(Y))))
-        return replaceInstUsesWith(
-            SI, Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, X, Y, &SI));
+      if (match(&SI, m_OrdOrUnordFMax(m_Value(X), m_Value(Y)))) {
+        Value *BinIntr =
+            Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, X, Y, &SI);
+        if (auto *BinIntrInst = dyn_cast<Instruction>(BinIntr))
+          BinIntrInst->setHasNoNaNs(FCmp->hasNoNaNs());
+        return replaceInstUsesWith(SI, BinIntr);
+      }
 
-      if (match(&SI, m_OrdOrUnordFMin(m_Value(X), m_Value(Y))))
-        return replaceInstUsesWith(
-            SI, Builder.CreateBinaryIntrinsic(Intrinsic::minnum, X, Y, &SI));
+      if (match(&SI, m_OrdOrUnordFMin(m_Value(X), m_Value(Y)))) {
+        Value *BinIntr =
+            Builder.CreateBinaryIntrinsic(Intrinsic::minnum, X, Y, &SI);
+        if (auto *BinIntrInst = dyn_cast<Instruction>(BinIntr))
+          BinIntrInst->setHasNoNaNs(FCmp->hasNoNaNs());
+        return replaceInstUsesWith(SI, BinIntr);
+      }
     }
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 392c5c78345c2..943598a30f040 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -776,6 +776,15 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
         return InsertNewInstWith(Shl, I->getIterator());
       }
     }
+
+    const APInt *Factor;
+    if (match(I->getOperand(0),
+              m_OneUse(m_Mul(m_Value(X), m_APInt(Factor)))) &&
+        Factor->countr_zero() >= ShiftAmt) {
+      BinaryOperator *Mul = BinaryOperator::CreateMul(
+          X, ConstantInt::get(X->getType(), Factor->lshr(ShiftAmt)));
+      return InsertNewInstWith(Mul, I->getIterator());
+    }
   }
 
   // Unsigned shift right.
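Reading the new SimplifyDemandedUseBits match: a one-use mul feeding the shift gets its constant factor pre-shifted, so the shift itself disappears whenever the factor carries at least ShiftAmt trailing zero bits. Under the unsigned-shift-right reading suggested by the adjacent context, the identity only holds on the bits the demanded mask keeps, since the shift zeroes the top ShiftAmt bits while the narrower multiply does not. A self-contained sanity check with made-up values (Factor = 24, ShiftAmt = 3, i8 arithmetic); this is an illustration, not code from the patch:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 1024; ++X) {
    // lshr (mul X, 24), 3 over i8 ...
    uint8_t Shifted = static_cast<uint8_t>(X * 24) >> 3;
    // ... against mul X, (24 >> 3), i.e. mul X, 3.
    uint8_t PreShifted = static_cast<uint8_t>(X * 3);
    // The two agree on the low 8 - 3 = 5 bits, i.e. exactly when the
    // demanded mask excludes the high ShiftAmt bits.
    assert((Shifted & 0x1f) == (PreShifted & 0x1f));
  }
  return 0;
}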
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 32f2a30afad48..3325a1868ebde 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1753,9 +1753,9 @@ static Value *simplifyInstructionWithPHI(Instruction &I, PHINode *PN,
   if (TerminatorBI && TerminatorBI->isConditional() &&
       TerminatorBI->getSuccessor(0) != TerminatorBI->getSuccessor(1) && ICmp) {
     bool LHSIsTrue = TerminatorBI->getSuccessor(0) == PN->getParent();
-    std::optional<bool> ImpliedCond =
-        isImpliedCondition(TerminatorBI->getCondition(), ICmp->getPredicate(),
-                           Ops[0], Ops[1], DL, LHSIsTrue);
+    std::optional<bool> ImpliedCond = isImpliedCondition(
+        TerminatorBI->getCondition(), ICmp->getCmpPredicate(), Ops[0], Ops[1],
+        DL, LHSIsTrue);
     if (ImpliedCond)
       return ConstantInt::getBool(I.getType(), ImpliedCond.value());
   }
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 2ea89be40a3d4..f9be7f933d31e 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -820,7 +820,8 @@ bool GCOVProfiler::emitProfileNotes(
       SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/false, BPI,
                                    BFI);
 
-      CFGMST<Edge, BBInfo> MST(F, /*InstrumentFuncEntry_=*/false, BPI, BFI);
+      CFGMST<Edge, BBInfo> MST(F, /*InstrumentFuncEntry=*/false,
+                               /*InstrumentLoopEntries=*/false, BPI, BFI);
 
       // getInstrBB can split basic blocks and push elements to AllEdges.
       for (size_t I : llvm::seq<size_t>(0, MST.numEdges())) {
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index dca6bf1adfde8..934500509873f 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -6159,8 +6159,10 @@ struct VarArgGenericHelper : public VarArgHelperBase {
     unsigned VAArgOffset = 0;
    const DataLayout &DL = F.getDataLayout();
    unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
-    for (Value *A :
-         llvm::drop_begin(CB.args(), CB.getFunctionType()->getNumParams())) {
+    for (const auto &[ArgNo, A] : llvm::enumerate(CB.args())) {
+      bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+      if (IsFixed)
+        continue;
       uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
       if (DL.isBigEndian()) {
         // Adjusting the shadow for argument with size < IntptrSize to match the
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 4d8141431a0c1..471086ce3a751 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -259,6 +259,11 @@ static cl::opt<bool> PGOInstrumentEntry(
     "pgo-instrument-entry", cl::init(false), cl::Hidden,
     cl::desc("Force to instrument function entry basicblock."));
 
+static cl::opt<bool>
+    PGOInstrumentLoopEntries("pgo-instrument-loop-entries", cl::init(false),
+                             cl::Hidden,
+                             cl::desc("Force to instrument loop entries."));
+
 static cl::opt<bool> PGOFunctionEntryCoverage(
     "pgo-function-entry-coverage", cl::Hidden,
     cl::desc(
@@ -359,6 +364,7 @@ class FunctionInstrumenter final {
   std::unordered_multimap<Comdat *, Comdat *> &ComdatMembers;
   BranchProbabilityInfo *const BPI;
   BlockFrequencyInfo *const BFI;
+  LoopInfo *const LI;
 
   const PGOInstrumentationType InstrumentationType;
 
@@ -376,14 +382,17 @@ class FunctionInstrumenter final {
            InstrumentationType ==
               PGOInstrumentationType::CTXPROF;
   }
 
+  bool shouldInstrumentLoopEntries() const { return PGOInstrumentLoopEntries; }
+
 public:
   FunctionInstrumenter(
       Module &M, Function &F, TargetLibraryInfo &TLI,
       std::unordered_multimap<Comdat *, Comdat *> &ComdatMembers,
       BranchProbabilityInfo *BPI = nullptr, BlockFrequencyInfo *BFI = nullptr,
+      LoopInfo *LI = nullptr,
       PGOInstrumentationType InstrumentationType = PGOInstrumentationType::FDO)
       : M(M), F(F), TLI(TLI), ComdatMembers(ComdatMembers), BPI(BPI), BFI(BFI),
-        InstrumentationType(InstrumentationType) {}
+        LI(LI), InstrumentationType(InstrumentationType) {}
 
   void instrument();
 };
@@ -439,6 +448,8 @@ createIRLevelProfileFlagVar(Module &M,
   if (PGOInstrumentEntry ||
       InstrumentationType == PGOInstrumentationType::CTXPROF)
     ProfileVersion |= VARIANT_MASK_INSTR_ENTRY;
+  if (PGOInstrumentLoopEntries)
+    ProfileVersion |= VARIANT_MASK_INSTR_LOOP_ENTRIES;
   if (DebugInfoCorrelate || ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO)
     ProfileVersion |= VARIANT_MASK_DBG_CORRELATE;
   if (PGOFunctionEntryCoverage)
@@ -625,12 +636,13 @@ template <class Edge, class BBInfo> class FuncPGOInstrumentation {
       Function &Func, TargetLibraryInfo &TLI,
       std::unordered_multimap<Comdat *, Comdat *> &ComdatMembers,
       bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
-      BlockFrequencyInfo *BFI = nullptr, bool IsCS = false,
-      bool InstrumentFuncEntry = true, bool HasSingleByteCoverage = false)
+      BlockFrequencyInfo *BFI = nullptr, LoopInfo *LI = nullptr,
+      bool IsCS = false, bool InstrumentFuncEntry = true,
+      bool InstrumentLoopEntries = false, bool HasSingleByteCoverage = false)
       : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func, TLI),
         TLI(TLI), ValueSites(IPVK_Last + 1),
         SIVisitor(Func, HasSingleByteCoverage),
-        MST(F, InstrumentFuncEntry, BPI, BFI),
+        MST(F, InstrumentFuncEntry, InstrumentLoopEntries, BPI, BFI, LI),
         BCI(constructBCI(Func, HasSingleByteCoverage, InstrumentFuncEntry)) {
     if (BCI && PGOViewBlockCoverageGraph)
       BCI->viewBlockCoverageGraph();
@@ -916,9 +928,10 @@ void FunctionInstrumenter::instrument() {
   const bool IsCtxProf = InstrumentationType == PGOInstrumentationType::CTXPROF;
 
   FuncPGOInstrumentation<PGOEdge, PGOBBInfo> FuncInfo(
-      F, TLI, ComdatMembers, /*CreateGlobalVar=*/!IsCtxProf, BPI, BFI,
+      F, TLI, ComdatMembers, /*CreateGlobalVar=*/!IsCtxProf, BPI, BFI, LI,
       InstrumentationType == PGOInstrumentationType::CSFDO,
-      shouldInstrumentEntryBB(), PGOBlockCoverage);
+      shouldInstrumentEntryBB(), shouldInstrumentLoopEntries(),
+      PGOBlockCoverage);
 
   auto *const Name = IsCtxProf ?
      cast<GlobalValue>(&F) : FuncInfo.FuncNameVar;
   auto *const CFGHash =
@@ -1136,11 +1149,13 @@ class PGOUseFunc {
   PGOUseFunc(Function &Func, Module *Modu, TargetLibraryInfo &TLI,
              std::unordered_multimap<Comdat *, Comdat *> &ComdatMembers,
              BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin,
-             ProfileSummaryInfo *PSI, bool IsCS, bool InstrumentFuncEntry,
+             LoopInfo *LI, ProfileSummaryInfo *PSI, bool IsCS,
+             bool InstrumentFuncEntry, bool InstrumentLoopEntries,
              bool HasSingleByteCoverage)
       : F(Func), M(Modu), BFI(BFIin), PSI(PSI),
-        FuncInfo(Func, TLI, ComdatMembers, false, BPI, BFIin, IsCS,
-                 InstrumentFuncEntry, HasSingleByteCoverage),
+        FuncInfo(Func, TLI, ComdatMembers, false, BPI, BFIin, LI, IsCS,
+                 InstrumentFuncEntry, InstrumentLoopEntries,
+                 HasSingleByteCoverage),
         FreqAttr(FFA_Normal), IsCS(IsCS), VPC(Func, TLI) {}
 
   void handleInstrProfError(Error Err, uint64_t MismatchedFuncSum);
@@ -1923,6 +1938,7 @@ static bool InstrumentAllFunctions(
     Module &M, function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
     function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
     function_ref<BlockFrequencyInfo *(Function &)> LookupBFI,
+    function_ref<LoopInfo *(Function &)> LookupLI,
     PGOInstrumentationType InstrumentationType) {
   // For the context-sensitve instrumentation, we should have a separated pass
   // (before LTO/ThinLTO linking) to create these variables.
@@ -1943,10 +1959,11 @@ static bool InstrumentAllFunctions(
   for (auto &F : M) {
     if (skipPGOGen(F))
       continue;
-    auto &TLI = LookupTLI(F);
-    auto *BPI = LookupBPI(F);
-    auto *BFI = LookupBFI(F);
-    FunctionInstrumenter FI(M, F, TLI, ComdatMembers, BPI, BFI,
+    TargetLibraryInfo &TLI = LookupTLI(F);
+    BranchProbabilityInfo *BPI = LookupBPI(F);
+    BlockFrequencyInfo *BFI = LookupBFI(F);
+    LoopInfo *LI = LookupLI(F);
+    FunctionInstrumenter FI(M, F, TLI, ComdatMembers, BPI, BFI, LI,
                             InstrumentationType);
     FI.instrument();
   }
@@ -1980,8 +1997,11 @@ PreservedAnalyses PGOInstrumentationGen::run(Module &M,
   auto LookupBFI = [&FAM](Function &F) {
     return &FAM.getResult<BlockFrequencyAnalysis>(F);
   };
+  auto LookupLI = [&FAM](Function &F) {
+    return &FAM.getResult<LoopAnalysis>(F);
+  };
 
-  if (!InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI,
+  if (!InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, LookupLI,
                               InstrumentationType))
     return PreservedAnalyses::all();
 
@@ -2116,7 +2136,8 @@ static bool annotateAllFunctions(
     function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
     function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
     function_ref<BlockFrequencyInfo *(Function &)> LookupBFI,
-    ProfileSummaryInfo *PSI, bool IsCS) {
+    function_ref<LoopInfo *(Function &)> LookupLI, ProfileSummaryInfo *PSI,
+    bool IsCS) {
   LLVM_DEBUG(dbgs() << "Read in profile counters: ");
   auto &Ctx = M.getContext();
   // Read the counter array from file.
@@ -2181,22 +2202,27 @@ static bool annotateAllFunctions(
   bool InstrumentFuncEntry = PGOReader->instrEntryBBEnabled();
   if (PGOInstrumentEntry.getNumOccurrences() > 0)
     InstrumentFuncEntry = PGOInstrumentEntry;
+  bool InstrumentLoopEntries = PGOReader->instrLoopEntriesEnabled();
+  if (PGOInstrumentLoopEntries.getNumOccurrences() > 0)
+    InstrumentLoopEntries = PGOInstrumentLoopEntries;
   bool HasSingleByteCoverage = PGOReader->hasSingleByteCoverage();
   for (auto &F : M) {
     if (skipPGOUse(F))
       continue;
-    auto &TLI = LookupTLI(F);
-    auto *BPI = LookupBPI(F);
-    auto *BFI = LookupBFI(F);
+    TargetLibraryInfo &TLI = LookupTLI(F);
+    BranchProbabilityInfo *BPI = LookupBPI(F);
+    BlockFrequencyInfo *BFI = LookupBFI(F);
+    LoopInfo *LI = LookupLI(F);
     if (!HasSingleByteCoverage) {
       // Split indirectbr critical edges here before computing the MST rather
       // than later in getInstrBB() to avoid invalidating it.
       SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/false, BPI,
                                    BFI);
     }
-    PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS,
-                    InstrumentFuncEntry, HasSingleByteCoverage);
+    PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, LI, PSI, IsCS,
+                    InstrumentFuncEntry, InstrumentLoopEntries,
+                    HasSingleByteCoverage);
     if (HasSingleByteCoverage) {
       Func.populateCoverage(PGOReader.get());
       continue;
@@ -2335,10 +2361,14 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M,
   auto LookupBFI = [&FAM](Function &F) {
     return &FAM.getResult<BlockFrequencyAnalysis>(F);
   };
+  auto LookupLI = [&FAM](Function &F) {
+    return &FAM.getResult<LoopAnalysis>(F);
+  };
 
   auto *PSI = &MAM.getResult<ProfileSummaryAnalysis>(M);
 
   if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName, *FS,
-                            LookupTLI, LookupBPI, LookupBFI, PSI, IsCS))
+                            LookupTLI, LookupBPI, LookupBFI, LookupLI, PSI,
+                            IsCS))
     return PreservedAnalyses::all();
 
   return PreservedAnalyses::none();
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 4884c23f16e12..944be38cb94bc 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -387,12 +387,12 @@ struct OffsetResult {
   Value *BasePtr;
   APInt ConstantOffset;
   SmallMapVector<Value *, APInt, 4> VariableOffsets;
-  bool AllInbounds;
+  GEPNoWrapFlags NW;
 
   OffsetResult() : BasePtr(nullptr), ConstantOffset(0, uint64_t(0)) {}
 
   OffsetResult(GEPOperator &GEP, const DataLayout &DL)
-      : BasePtr(GEP.getPointerOperand()), AllInbounds(GEP.isInBounds()) {
+      : BasePtr(GEP.getPointerOperand()), NW(GEP.getNoWrapFlags()) {
     ConstantOffset = APInt(DL.getIndexTypeSizeInBits(BasePtr->getType()), 0);
   }
 };
@@ -426,7 +426,7 @@ static OffsetResult collectOffsets(GEPOperator &GEP, const DataLayout &DL) {
     Result.ConstantOffset += ConstantOffset2;
     if (Result.VariableOffsets.size() == 0 && VariableOffsets2.size() == 1)
       Result.VariableOffsets = VariableOffsets2;
-    Result.AllInbounds &= InnerGEP->isInBounds();
+    Result.NW &= InnerGEP->getNoWrapFlags();
   }
   return Result;
 }
@@ -450,9 +450,9 @@ static Decomposition decomposeGEP(GEPOperator &GEP,
   assert(!IsSigned && "The logic below only supports decomposition for "
                       "unsigned predicates at the moment.");
-  const auto &[BasePtr, ConstantOffset, VariableOffsets, AllInbounds] =
+  const auto &[BasePtr, ConstantOffset, VariableOffsets, NW] =
       collectOffsets(GEP, DL);
-  if (!BasePtr || !AllInbounds)
+  if (!BasePtr || !NW.hasNoUnsignedSignedWrap())
     return &GEP;
 
   Decomposition Result(ConstantOffset.getSExtValue(), DecompEntry(1, BasePtr));
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index e9e1071ea210c..0cba5d077da62 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -68,6 +68,7 @@ static cl::opt<bool> EnableMemCpyOptWithoutLibcalls(
     cl::desc("Enable memcpyopt even when libcalls are disabled"));
 
 STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemMoveInstr, "Number of memmove instructions deleted");
 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
 STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
 STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
@@ -1841,12 +1842,75 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
   return false;
 }
 
+/// Memmove calls with overlapping src/dest buffers that come after a memset may
+/// be removed.
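Before the helper's definition (next), the shape of the redundancy it hunts for, as a self-contained sketch; the buffer name and the sizes are invented for illustration:

#include <cstring>

void redundant_move(char *Buf) { // Buf points at >= 64 writable bytes.
  std::memset(Buf, 0, 64);
  // dest == Buf and src == Buf + 16, so this is memmove(x, x + A, B) with
  // A = 16, B = 32, and A + B = 48 <= 64: every byte read or written was
  // already zeroed by the dominating memset, making the move a no-op that
  // the new fold can delete.
  std::memmove(Buf, Buf + 16, 32);
}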
+bool MemCpyOptPass::isMemMoveMemSetDependency(MemMoveInst *M) {
+  const auto &DL = M->getDataLayout();
+  MemoryUseOrDef *MemMoveAccess = MSSA->getMemoryAccess(M);
+  if (!MemMoveAccess)
+    return false;
+
+  // The memmove is of the form memmove(x, x + A, B).
+  MemoryLocation SourceLoc = MemoryLocation::getForSource(M);
+  auto *MemMoveSourceOp = M->getSource();
+  auto *Source = dyn_cast<GEPOperator>(MemMoveSourceOp);
+  if (!Source)
+    return false;
+
+  APInt Offset(DL.getIndexTypeSizeInBits(Source->getType()), 0);
+  LocationSize MemMoveLocSize = SourceLoc.Size;
+  if (Source->getPointerOperand() != M->getDest() ||
+      !MemMoveLocSize.hasValue() ||
+      !Source->accumulateConstantOffset(DL, Offset) || Offset.isNegative()) {
+    return false;
+  }
+
+  uint64_t MemMoveSize = MemMoveLocSize.getValue();
+  LocationSize TotalSize =
+      LocationSize::precise(Offset.getZExtValue() + MemMoveSize);
+  MemoryLocation CombinedLoc(M->getDest(), TotalSize);
+
+  // The first dominating clobbering MemoryAccess for the combined location
+  // needs to be a memset.
+  BatchAAResults BAA(*AA);
+  MemoryAccess *FirstDef = MemMoveAccess->getDefiningAccess();
+  auto *DestClobber = dyn_cast<MemoryDef>(
+      MSSA->getWalker()->getClobberingMemoryAccess(FirstDef, CombinedLoc, BAA));
+  if (!DestClobber)
+    return false;
+
+  auto *MS = dyn_cast_or_null<MemSetInst>(DestClobber->getMemoryInst());
+  if (!MS)
+    return false;
+
+  // Memset length must be sufficiently large.
+  auto *MemSetLength = dyn_cast<ConstantInt>(MS->getLength());
+  if (!MemSetLength || MemSetLength->getZExtValue() < MemMoveSize)
+    return false;
+
+  // The destination buffer must have been memset'd.
+  if (!BAA.isMustAlias(MS->getDest(), M->getDest()))
+    return false;
+
+  return true;
+}
+
 /// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
 /// not to alias.
-bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
+bool MemCpyOptPass::processMemMove(MemMoveInst *M, BasicBlock::iterator &BBI) {
   // See if the source could be modified by this memmove potentially.
-  if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(M))))
+  if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(M)))) {
+    // On the off-chance the memmove clobbers src with previously memset'd
+    // bytes, the memmove may be redundant.
+    if (!M->isVolatile() && isMemMoveMemSetDependency(M)) {
+      LLVM_DEBUG(dbgs() << "Removed redundant memmove.\n");
+      ++BBI;
+      eraseInstruction(M);
+      ++NumMemMoveInstr;
+      return true;
+    }
     return false;
+  }
 
   LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
                     << "\n");
@@ -2064,7 +2128,7 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
       else if (auto *M = dyn_cast<MemCpyInst>(I))
         RepeatInstruction = processMemCpy(M, BI);
       else if (auto *M = dyn_cast<MemMoveInst>(I))
-        RepeatInstruction = processMemMove(M);
+        RepeatInstruction = processMemMove(M, BI);
       else if (auto *CB = dyn_cast<CallBase>(I)) {
         for (unsigned i = 0, e = CB->arg_size(); i != e; ++i) {
           if (CB->isByValArgument(i))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 90312c1a28df3..3c7c044a04271 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7721,6 +7721,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   BestVPlan.prepareToExecute(ILV.getTripCount(),
                              ILV.getOrCreateVectorTripCount(nullptr),
                              CanonicalIVStartValue, State);
+  VPlanTransforms::prepareToExecute(BestVPlan);
 
   BestVPlan.execute(&State);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0475510264336..33657c26356d6 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6073,6 +6073,23 @@ void BoUpSLP::reorderTopToBottom() {
                          TE->Scalars.size();
                 }) &&
                "All users must be of VF size.");
+        if (SLPReVec) {
+          assert(SLPReVec && "Only supported by REVEC.");
+          // ShuffleVectorInst does not do reorderOperands (and it should not
+          // because ShuffleVectorInst supports only a limited set of
+          // patterns). Only do reorderNodeWithReuses if all of the users are
+          // not ShuffleVectorInst.
+          if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
+                return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
+              }))
+            continue;
+          assert(none_of(TE->UserTreeIndices,
+                         [&](const EdgeInfo &EI) {
+                           return isa<ShuffleVectorInst>(
+                               EI.UserTE->getMainOp());
+                         }) &&
+                 "Does not know how to reorder.");
+        }
         // Update ordering of the operands with the smaller VF than the given
         // one.
         reorderNodeWithReuses(*TE, Mask);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 529108a5aaa97..b801d1863e252 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1070,10 +1070,9 @@ void VPlan::execute(VPTransformState *State) {
     }
 
     auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
-    bool NeedsScalar =
-        isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
-        (isa<VPReductionPHIRecipe>(PhiR) &&
-         cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+    bool NeedsScalar = isa<VPScalarPHIRecipe>(PhiR) ||
+                       (isa<VPReductionPHIRecipe>(PhiR) &&
+                        cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
     Value *Phi = State->get(PhiR, NeedsScalar);
     Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
     cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1b1630ebc6c23..e1d828f038f9a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2239,6 +2239,45 @@ class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe,
 #endif
 };
 
+/// Recipe to generate a scalar PHI. Used to generate code for recipes that
+/// produce scalar header phis, including VPCanonicalIVPHIRecipe and
+/// VPEVLBasedIVPHIRecipe.
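Before the class itself (below), a sketch of the intended lowering idiom: an abstract header-phi recipe is swapped for the concrete scalar phi right before codegen. This mirrors the VPlanTransforms::prepareToExecute hunk later in this patch; the wrapper function is invented, and the fragment only compiles inside LLVM's Vectorize sources:

// Replace an abstract IV phi recipe (canonical IV or EVL-based IV) with a
// VPScalarPHIRecipe that materializes a single scalar phi at execute time.
static void lowerToScalarPHI(VPHeaderPHIRecipe *PhiR, StringRef Name) {
  auto *ScalarR = new VPScalarPHIRecipe(PhiR->getStartValue(),
                                        PhiR->getBackedgeValue(),
                                        PhiR->getDebugLoc(), Name);
  ScalarR->insertBefore(PhiR);
  PhiR->replaceAllUsesWith(ScalarR);
  PhiR->eraseFromParent();
}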
+class VPScalarPHIRecipe : public VPHeaderPHIRecipe {
+  std::string Name;
+
+public:
+  VPScalarPHIRecipe(VPValue *Start, VPValue *BackedgeValue, DebugLoc DL,
+                    StringRef Name)
+      : VPHeaderPHIRecipe(VPDef::VPScalarPHISC, nullptr, Start, DL),
+        Name(Name.str()) {
+    addOperand(BackedgeValue);
+  }
+
+  ~VPScalarPHIRecipe() override = default;
+
+  VPScalarPHIRecipe *clone() override {
+    llvm_unreachable("cloning not implemented yet");
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPScalarPHISC)
+
+  /// Generate the phi/select nodes.
+  void execute(VPTransformState &State) override;
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return true;
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for handling phis that are widened in the vector loop.
 /// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
 /// managed in the recipe directly.
@@ -3134,8 +3173,10 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
     return D->getVPDefID() == VPDef::VPCanonicalIVPHISC;
   }
 
-  /// Generate the canonical scalar induction phi of the vector loop.
-  void execute(VPTransformState &State) override;
+  void execute(VPTransformState &State) override {
+    llvm_unreachable(
+        "cannot execute this recipe, should be replaced by VPScalarPHIRecipe");
+  }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -3231,9 +3272,10 @@ class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
     return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC;
   }
 
-  /// Generate phi for handling IV based on EVL over iterations correctly.
-  /// TODO: investigate if it can share the code with VPCanonicalIVPHIRecipe.
-  void execute(VPTransformState &State) override;
+  void execute(VPTransformState &State) override {
+    llvm_unreachable(
+        "cannot execute this recipe, should be replaced by VPScalarPHIRecipe");
+  }
 
   /// Return the cost of this VPEVLBasedIVPHIRecipe.
   InstructionCost computeCost(ElementCount VF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index cb42cfe8159b0..969d07b229e46 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -213,14 +213,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
       TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
          .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
                VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
-               VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
-              [this](const auto *R) {
-                // Handle header phi recipes, except VPWidenIntOrFpInduction
-                // which needs special handling due it being possibly truncated.
-                // TODO: consider inferring/caching type of siblings, e.g.,
-                // backedge value, here and in cases below.
-                return inferScalarType(R->getStartValue());
-              })
+               VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe,
+               VPScalarPHIRecipe>([this](const auto *R) {
+                // Handle header phi recipes, except VPWidenIntOrFpInduction
+                // which needs special handling due it being possibly truncated.
+                // TODO: consider inferring/caching type of siblings, e.g.,
+                // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue()); + }) .Case( [](const auto *R) { return R->getScalarType(); }) .CasegetLiveInIRValue(); - PHINode *Phi = PHINode::Create(Start->getType(), 2, "index"); - Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); - - BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); - Phi->addIncoming(Start, VectorPH); - Phi->setDebugLoc(getDebugLoc()); - State.set(this, Phi, /*IsScalar*/ true); -} - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { @@ -3154,8 +3142,6 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { assert(!onlyScalarsGenerated(State.VF.isScalable()) && "Recipe should have been replaced"); - auto *IVR = getParent()->getPlan()->getCanonicalIV(); - PHINode *CanonicalIV = cast(State.get(IVR, /*IsScalar*/ true)); unsigned CurrentPart = getUnrollPart(*this); // Build a pointer phi @@ -3165,6 +3151,12 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); PHINode *NewPointerPhi = nullptr; if (CurrentPart == 0) { + auto *IVR = cast(&getParent() + ->getPlan() + ->getVectorLoopRegion() + ->getEntryBasicBlock() + ->front()); + PHINode *CanonicalIV = cast(State.get(IVR, /*IsScalar*/ true)); NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV->getIterator()); NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); @@ -3478,20 +3470,30 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif + +void VPScalarPHIRecipe::execute(VPTransformState &State) { BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); Value *Start = State.get(getOperand(0), VPLane(0)); - PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv"); + PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, Name); Phi->addIncoming(Start, VectorPH); Phi->setDebugLoc(getDebugLoc()); State.set(this, Phi, /*IsScalar=*/true); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI "; - +void VPScalarPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "SCALAR-PHI"; printAsOperand(O, SlotTracker); O << " = phi "; printOperands(O, SlotTracker); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1b333bdc30ff1..cee83d1015b53 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1819,3 +1819,24 @@ void VPlanTransforms::createInterleaveGroups( } } } + +void VPlanTransforms::prepareToExecute(VPlan &Plan) { + ReversePostOrderTraversal> RPOT( + Plan.getVectorLoopRegion()); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_deep(Plan.getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) { + if (!isa(&R)) + continue; + auto *PhiR = cast(&R); + 
StringRef Name = + isa(PhiR) ? "index" : "evl.based.iv"; + auto *ScalarR = + new VPScalarPHIRecipe(PhiR->getStartValue(), PhiR->getBackedgeValue(), + PhiR->getDebugLoc(), Name); + ScalarR->insertBefore(PhiR); + PhiR->replaceAllUsesWith(ScalarR); + PhiR->eraseFromParent(); + } + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 11e094db6294f..1491e0a8df04d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -123,6 +123,9 @@ struct VPlanTransforms { /// Remove dead recipes from \p Plan. static void removeDeadRecipes(VPlan &Plan); + + /// Lower abstract recipes to concrete ones, that can be codegen'd. + static void prepareToExecute(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 691b0d40823cf..957a602091c73 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -359,6 +359,7 @@ class VPDef { VPFirstOrderRecurrencePHISC, VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, + VPScalarPHISC, VPReductionPHISC, // END: SubclassID for recipes that inherit VPHeaderPHIRecipe // END: Phi-like recipes diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 57a56c6a60415..40fdb14e81333 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -520,24 +520,6 @@ if(build_runtimes) endif() endforeach() endif() - if("libc" IN_LIST LLVM_ENABLE_PROJECTS AND - (LLVM_LIBC_FULL_BUILD OR LLVM_LIBC_GPU_BUILD)) - if(LIBC_HDRGEN_EXE) - set(hdrgen_exe ${LIBC_HDRGEN_EXE}) - else() - if(TARGET ${LIBC_TABLEGEN_EXE}) - set(hdrgen_exe $) - else() - set(hdrgen_exe ${LIBC_TABLEGEN_EXE}) - endif() - set(hdrgen_deps ${LIBC_TABLEGEN_TARGET}) - endif() - if(NOT hdrgen_exe) - message(FATAL_ERROR "libc-hdrgen executable missing") - endif() - list(APPEND extra_cmake_args "-DLIBC_HDRGEN_EXE=${hdrgen_exe}") - list(APPEND extra_deps ${hdrgen_deps}) - endif() if(LLVM_LIBC_GPU_BUILD) list(APPEND extra_cmake_args "-DLLVM_LIBC_GPU_BUILD=ON") if("libc" IN_LIST RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES) diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll b/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll index 0b2c8da4438da..9eb06a07f135f 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll @@ -1,30 +1,48 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define void @fabs() { ; CHECK-LABEL: 'fabs' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call float @llvm.fabs.f32(float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %3 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.fabs.nxv1f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.fabs.nxv2f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.fabs.nxv4f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.fabs.nxv8f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call @llvm.fabs.nxv16f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call double @llvm.fabs.f64(double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call @llvm.fabs.nxv1f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call @llvm.fabs.nxv2f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call @llvm.fabs.nxv4f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call @llvm.fabs.nxv8f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fabs.bf16(bfloat undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.fabs.nxv2bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.fabs.nxv4bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call @llvm.fabs.nxv8bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call @llvm.fabs.nxv16bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.fabs.f32(float undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %13 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call @llvm.fabs.nxv1f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call @llvm.fabs.nxv2f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.fabs.nxv4f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call @llvm.fabs.nxv8f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %19 = call @llvm.fabs.nxv16f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call double @llvm.fabs.f64(double undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %24 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call @llvm.fabs.nxv1f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.fabs.nxv2f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %27 = call @llvm.fabs.nxv4f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call @llvm.fabs.nxv8f64( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + call bfloat @llvm.fabs.bf16(bfloat undef) + call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef) + call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef) + call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef) + call <16 x bfloat> @llvm.fabs.v16f16(<16 x bfloat> undef) + call @llvm.fabs.nxv2bf16( undef) + call @llvm.fabs.nxv4bf16( undef) + call @llvm.fabs.nxv8bf16( undef) + call @llvm.fabs.nxv16f16( undef) call float @llvm.fabs.f32(float undef) call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) @@ -53,11 +71,11 @@ define void @fabs_f16() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.fabs.nxv2f16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.fabs.nxv4f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.fabs.nxv8f16( undef) -; CHECK-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.fabs.nxv16f16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call @llvm.fabs.nxv8f16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call @llvm.fabs.nxv16f16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call half @llvm.fabs.f16(half undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll index be9c19dc59a85..446627f6bf3c0 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll @@ -8,30 +8,30 @@ define void @sqrt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.sqrt.nxv2bf16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.sqrt.nxv4bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.sqrt.nxv8bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.sqrt.nxv16bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call @llvm.sqrt.nxv8bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call @llvm.sqrt.nxv16bf16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.sqrt.f32(float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call @llvm.sqrt.nxv1f32( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call @llvm.sqrt.nxv2f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call @llvm.sqrt.nxv4f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call @llvm.sqrt.nxv8f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call @llvm.sqrt.nxv16f32( undef) +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call @llvm.sqrt.nxv4f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call @llvm.sqrt.nxv8f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %19 = call @llvm.sqrt.nxv16f32( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call double @llvm.sqrt.f64(double undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %24 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call @llvm.sqrt.nxv1f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call @llvm.sqrt.nxv2f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call @llvm.sqrt.nxv4f64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call @llvm.sqrt.nxv8f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %26 = call @llvm.sqrt.nxv2f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %27 = call @llvm.sqrt.nxv4f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %28 = call @llvm.sqrt.nxv8f64( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call bfloat @llvm.sqrt.bf16(bfloat undef) @@ -71,11 +71,11 @@ define void @sqrt_f16() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.sqrt.nxv2f16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.sqrt.nxv4f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.sqrt.nxv8f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.sqrt.nxv16f16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call @llvm.sqrt.nxv8f16( undef) +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %9 = call @llvm.sqrt.nxv16f16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call half @llvm.sqrt.f16(half undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll b/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll index 55db70ce1e912..5f2728f93d551 100644 --- a/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll +++ b/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll @@ -209,37 +209,37 @@ define void @ctlz() { ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.ctlz.nxv2i8( undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.ctlz.nxv4i8( undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.ctlz.nxv8i8( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.ctlz.nxv16i8( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call @llvm.ctlz.nxv32i8( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call @llvm.ctlz.nxv64i8( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call @llvm.ctlz.nxv16i8( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call @llvm.ctlz.nxv32i8( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %11 = call @llvm.ctlz.nxv64i8( undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call @llvm.ctlz.nxv1i16( undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call @llvm.ctlz.nxv2i16( undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call @llvm.ctlz.nxv4i16( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call @llvm.ctlz.nxv8i16( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call @llvm.ctlz.nxv16i16( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call @llvm.ctlz.nxv32i16( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call @llvm.ctlz.nxv8i16( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = call @llvm.ctlz.nxv16i16( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %21 = call @llvm.ctlz.nxv32i16( undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %23 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call @llvm.ctlz.nxv1i32( undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call @llvm.ctlz.nxv2i32( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call @llvm.ctlz.nxv4i32( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call @llvm.ctlz.nxv8i32( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call @llvm.ctlz.nxv16i32( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call @llvm.ctlz.nxv4i32( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %29 = call @llvm.ctlz.nxv8i32( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %30 = call @llvm.ctlz.nxv16i32( undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = call <16 x i64> @llvm.ctlz.v16i64(<16 x i64> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %32 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %33 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %34 = call <16 x i64> @llvm.ctlz.v16i64(<16 x i64> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = call @llvm.ctlz.nxv1i64( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = call @llvm.ctlz.nxv2i64( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = call @llvm.ctlz.nxv4i64( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %38 = call @llvm.ctlz.nxv8i64( undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %39 = call @llvm.ctlz.nxv16i64( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %36 = call @llvm.ctlz.nxv2i64( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %37 = call @llvm.ctlz.nxv4i64( undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %38 = call @llvm.ctlz.nxv8i64( undef, i1 false) +; ZVBB-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %39 = call <vscale x 16 x i64> @llvm.ctlz.nxv16i64(<vscale x 16 x i64> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> undef, i1 false) @@ -336,37 +336,37 @@ define void @cttz() { ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x i8> @llvm.cttz.nxv2i8(<vscale x 2 x i8> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x i8> @llvm.cttz.nxv4i8(<vscale x 4 x i8> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x i8> @llvm.cttz.nxv8i8(<vscale x 8 x i8> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x i8> @llvm.cttz.nxv16i8(<vscale x 16 x i8> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 32 x i8> @llvm.cttz.nxv32i8(<vscale x 32 x i8> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <vscale x 64 x i8> @llvm.cttz.nxv64i8(<vscale x 64 x i8> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 16 x i8> @llvm.cttz.nxv16i8(<vscale x 16 x i8> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <vscale x 32 x i8> @llvm.cttz.nxv32i8(<vscale x 32 x i8> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %11 = call <vscale x 64 x i8> @llvm.cttz.nxv64i8(<vscale x 64 x i8> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 1 x i16> @llvm.cttz.nxv1i16(<vscale x 1 x i16> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 2 x i16> @llvm.cttz.nxv2i16(<vscale x 2 x i16> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 4 x i16> @llvm.cttz.nxv4i16(<vscale x 4 x i16> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <vscale x 8 x i16> @llvm.cttz.nxv8i16(<vscale x 8 x i16> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <vscale x 16 x i16> @llvm.cttz.nxv16i16(<vscale x 16 x i16> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <vscale x 32 x i16> @llvm.cttz.nxv32i16(<vscale x 32 x i16> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x i16> @llvm.cttz.nxv8i16(<vscale x 8 x i16> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = call <vscale x 16 x i16> @llvm.cttz.nxv16i16(<vscale x 16 x i16> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %21 = call <vscale x 32 x i16> @llvm.cttz.nxv32i16(<vscale x 32 x i16> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <vscale x 1 x i32> @llvm.cttz.nxv1i32(<vscale x 1 x i32> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <vscale x 2 x i32> @llvm.cttz.nxv2i32(<vscale x 2 x i32> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %29 = call <vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %30 = call <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = call <16 x i64> @llvm.cttz.v16i64(<16 x i64> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %32 = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %33 = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %34 = call <16 x i64> @llvm.cttz.v16i64(<16 x i64> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = call <vscale x 1 x i64> @llvm.cttz.nxv1i64(<vscale x 1 x i64> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = call <vscale x 2 x i64> @llvm.cttz.nxv2i64(<vscale x 2 x i64> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = call <vscale x 4 x i64> @llvm.cttz.nxv4i64(<vscale x 4 x i64> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %38 = call <vscale x 8 x i64> @llvm.cttz.nxv8i64(<vscale x 8 x i64> undef, i1 false) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %39 = call <vscale x 16 x i64> @llvm.cttz.nxv16i64(<vscale x 16 x i64> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %36 = call <vscale x 2 x i64> @llvm.cttz.nxv2i64(<vscale x 2 x i64> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %37 = call <vscale x 4 x i64> @llvm.cttz.nxv4i64(<vscale x 4 x i64> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %38 = call <vscale x 8 x i64> @llvm.cttz.nxv8i64(<vscale x 8 x i64> undef, i1 false) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %39 = call <vscale x 16 x i64> @llvm.cttz.nxv16i64(<vscale x 16 x i64> undef, i1 false) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call <2 x i8> @llvm.cttz.v2i8(<2 x i8> undef, i1 false) @@ -465,37 +465,37 @@ define void @ctpop() { ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x i8> @llvm.ctpop.nxv2i8(<vscale x 2 x i8> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x i8> @llvm.ctpop.nxv4i8(<vscale x 4 x i8> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x i8> @llvm.ctpop.nxv8i8(<vscale x 8 x i8> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x i8> @llvm.ctpop.nxv16i8(<vscale x 16 x i8> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <vscale x 16 x i8> @llvm.ctpop.nxv16i8(<vscale x 16 x i8> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call i16 @llvm.ctpop.i16(i16 undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 1 x i16> @llvm.ctpop.nxv1i16(<vscale x 1 x i16> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 2 x i16> @llvm.ctpop.nxv2i16(<vscale x 2 x i16> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 4 x i16> @llvm.ctpop.nxv4i16(<vscale x 4 x i16> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <vscale x 8 x i16> @llvm.ctpop.nxv8i16(<vscale x 8 x i16> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <vscale x 16 x i16> @llvm.ctpop.nxv16i16(<vscale x 16 x i16> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x i16> @llvm.ctpop.nxv8i16(<vscale x 8 x i16> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = call <vscale x 16 x i16> @llvm.ctpop.nxv16i16(<vscale x 16 x i16> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %21 = call i32 @llvm.ctpop.i32(i32 undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <vscale x 1 x i32> @llvm.ctpop.nxv1i32(<vscale x 1 x i32> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <vscale x 2 x i32> @llvm.ctpop.nxv2i32(<vscale x 2 x i32> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <vscale x 4 x i32> @llvm.ctpop.nxv4i32(<vscale x 4 x i32> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <vscale x 8 x i32> @llvm.ctpop.nxv8i32(<vscale x 8 x i32> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call <vscale x 4 x i32> @llvm.ctpop.nxv4i32(<vscale x 4 x i32> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %29 = call <vscale x 8 x i32> @llvm.ctpop.nxv8i32(<vscale x 8 x i32> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %30 = call <vscale x 16 x i32> @llvm.ctpop.nxv16i32(<vscale x 16 x i32> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call i64 @llvm.ctpop.i64(i64 undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = call <16 x i64> @llvm.ctpop.v16i64(<16 x i64> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %33 = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %34 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %35 = call <16 x i64> @llvm.ctpop.v16i64(<16 x i64> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = call <vscale x 1 x i64> @llvm.ctpop.nxv1i64(<vscale x 1 x i64> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = call <vscale x 2 x i64> @llvm.ctpop.nxv2i64(<vscale x 2 x i64> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %38 = call <vscale x 4 x i64> @llvm.ctpop.nxv4i64(<vscale x 4 x i64> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %39 = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> undef) -; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %40 = call <vscale x 16 x i64> @llvm.ctpop.nxv16i64(<vscale x 16 x i64> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %37 = call <vscale x 2 x i64> @llvm.ctpop.nxv2i64(<vscale x 2 x i64> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %38 = call <vscale x 4 x i64> @llvm.ctpop.nxv4i64(<vscale x 4 x i64> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %39 = call <vscale x 8 x i64> @llvm.ctpop.nxv8i64(<vscale x 8 x i64> undef) +; ZVBB-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %40 = call <vscale x 16 x i64> @llvm.ctpop.nxv16i64(<vscale x 16 x i64> undef) ; ZVBB-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call i8 @llvm.ctpop.i8(i8 undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/int-sat-math.ll b/llvm/test/Analysis/CostModel/RISCV/int-sat-math.ll index be6b7c57d2252..0758eb204be48 100644 --- a/llvm/test/Analysis/CostModel/RISCV/int-sat-math.ll +++ b/llvm/test/Analysis/CostModel/RISCV/int-sat-math.ll @@ -11,33 +11,33 @@ define void @sadd.sat() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x i8> @llvm.sadd.sat.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x i8> @llvm.sadd.sat.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x i8> @llvm.sadd.sat.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x i8> @llvm.sadd.sat.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 16 x i8> @llvm.sadd.sat.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %10 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <vscale x 2 x i16> @llvm.sadd.sat.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 4 x i16> @llvm.sadd.sat.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 16 x i16> @llvm.sadd.sat.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call <vscale x 16 x i16> @llvm.sadd.sat.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %19 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <vscale x 2 x i32> @llvm.sadd.sat.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <vscale x 8 x i32> @llvm.sadd.sat.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <vscale x 16 x i32> @llvm.sadd.sat.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call <vscale x 8 x i32> @llvm.sadd.sat.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = call <vscale x 16 x i32> @llvm.sadd.sat.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %28 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <16 x i64> @llvm.sadd.sat.v16i64(<16 x i64> undef, <16 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <vscale x 2 x i64> @llvm.sadd.sat.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = call <vscale x 4 x i64> @llvm.sadd.sat.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = call <vscale x 8 x i64> @llvm.sadd.sat.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %32 = call <16 x i64> @llvm.sadd.sat.v16i64(<16 x i64> undef, <16 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %33 = call <vscale x 2 x i64> @llvm.sadd.sat.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %34 = call <vscale x 4 x i64> @llvm.sadd.sat.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %35 = call <vscale x 8 x i64> @llvm.sadd.sat.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) @@ -88,33 +88,33 @@ define void @uadd.sat() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x i8> @llvm.uadd.sat.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x i8> @llvm.uadd.sat.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x i8> @llvm.uadd.sat.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x i8> @llvm.uadd.sat.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 16 x i8> @llvm.uadd.sat.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <vscale x 2 x i16> @llvm.uadd.sat.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 4 x i16> @llvm.uadd.sat.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 8 x i16> @llvm.uadd.sat.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 16 x i16> @llvm.uadd.sat.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 8 x i16> @llvm.uadd.sat.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call <vscale x 16 x i16> @llvm.uadd.sat.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %19 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <vscale x 2 x i32> @llvm.uadd.sat.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <vscale x 4 x i32> @llvm.uadd.sat.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <vscale x 8 x i32> @llvm.uadd.sat.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <vscale x 16 x i32> @llvm.uadd.sat.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call <vscale x 4 x i32> @llvm.uadd.sat.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call <vscale x 8 x i32> @llvm.uadd.sat.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = call <vscale x 16 x i32> @llvm.uadd.sat.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %28 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <16 x i64> @llvm.uadd.sat.v16i64(<16 x i64> undef, <16 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <vscale x 2 x i64> @llvm.uadd.sat.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = call <vscale x 4 x i64> @llvm.uadd.sat.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = call <vscale x 8 x i64> @llvm.uadd.sat.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %32 = call <16 x i64> @llvm.uadd.sat.v16i64(<16 x i64> undef, <16 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %33 = call <vscale x 2 x i64> @llvm.uadd.sat.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %34 = call <vscale x 4 x i64> @llvm.uadd.sat.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %35 = call <vscale x 8 x i64> @llvm.uadd.sat.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) @@ -165,33 +165,33 @@ define void @usub.sat() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x i8> @llvm.usub.sat.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x i8> @llvm.usub.sat.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x i8> @llvm.usub.sat.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x i8> @llvm.usub.sat.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 16 x i8> @llvm.usub.sat.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <vscale x 2 x i16> @llvm.usub.sat.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 4 x i16> @llvm.usub.sat.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 8 x i16> @llvm.usub.sat.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 16 x i16> @llvm.usub.sat.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 8 x i16> @llvm.usub.sat.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call <vscale x 16 x i16> @llvm.usub.sat.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %19 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <vscale x 2 x i32> @llvm.usub.sat.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <vscale x 4 x i32> @llvm.usub.sat.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <vscale x 8 x i32> @llvm.usub.sat.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <vscale x 16 x i32> @llvm.usub.sat.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call <vscale x 4 x i32> @llvm.usub.sat.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call <vscale x 8 x i32> @llvm.usub.sat.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = call <vscale x 16 x i32> @llvm.usub.sat.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %28 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <16 x i64> @llvm.usub.sat.v16i64(<16 x i64> undef, <16 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <vscale x 2 x i64> @llvm.usub.sat.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = call <vscale x 4 x i64> @llvm.usub.sat.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = call <vscale x 8 x i64> @llvm.usub.sat.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %32 = call <16 x i64> @llvm.usub.sat.v16i64(<16 x i64> undef, <16 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %33 = call <vscale x 2 x i64> @llvm.usub.sat.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %34 = call <vscale x 4 x i64> @llvm.usub.sat.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %35 = call <vscale x 8 x i64> @llvm.usub.sat.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) @@ -242,33 +242,33 @@ define void @ssub.sat() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x i8> @llvm.ssub.sat.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x i8> @llvm.ssub.sat.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x i8> @llvm.ssub.sat.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x i8> @llvm.ssub.sat.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 16 x i8> @llvm.ssub.sat.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %10 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <vscale x 2 x i16> @llvm.ssub.sat.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 4 x i16> @llvm.ssub.sat.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 8 x i16> @llvm.ssub.sat.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 16 x i16> @llvm.ssub.sat.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 8 x i16> @llvm.ssub.sat.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call <vscale x 16 x i16> @llvm.ssub.sat.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %19 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %22 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <vscale x 2 x i32> @llvm.ssub.sat.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <vscale x 4 x i32> @llvm.ssub.sat.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <vscale x 8 x i32> @llvm.ssub.sat.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <vscale x 16 x i32> @llvm.ssub.sat.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = call <vscale x 4 x i32> @llvm.ssub.sat.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = call <vscale x 8 x i32> @llvm.ssub.sat.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = call <vscale x 16 x i32> @llvm.ssub.sat.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %28 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <16 x i64> @llvm.ssub.sat.v16i64(<16 x i64> undef, <16 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <vscale x 2 x i64> @llvm.ssub.sat.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = call <vscale x 4 x i64> @llvm.ssub.sat.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = call <vscale x 8 x i64> @llvm.ssub.sat.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %32 = call <16 x i64> @llvm.ssub.sat.v16i64(<16 x i64> undef, <16 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %33 = call <vscale x 2 x i64> @llvm.ssub.sat.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %34 = call <vscale x 4 x i64> @llvm.ssub.sat.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %35 = call <vscale x 8 x i64> @llvm.ssub.sat.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) diff --git a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll deleted file mode 100644 index 90b5b746c914a..0000000000000 --- a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll +++ /dev/null @@ -1,128 +0,0 @@ -; RUN: opt < %s -mtriple=systemz-unknown -mcpu=z13 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s - -define void @reduce(ptr %src, ptr %dst) { -; CHECK-LABEL: 'reduce' -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %V2_64) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %V4_64) -; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64) -; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %R16_64 = call i64 
@llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64) -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32) -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32) -; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16) -; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8) -; -; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8) -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256) - - ; REDUCEADD64 - - %V2_64 = load <2 x i64>, ptr %src, align 8 - %R2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %V2_64) - store volatile i64 %R2_64, ptr %dst, align 4 - - %V4_64 = load <4 x i64>, ptr %src, align 8 - %R4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %V4_64) - store volatile i64 %R4_64, ptr %dst, align 4 - - %V8_64 = load <8 x i64>, ptr %src, align 8 - %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64) - store volatile i64 %R8_64, ptr %dst, align 4 - - %V16_64 = load <16 x i64>, ptr %src, align 8 - %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64) - store volatile i64 %R16_64, ptr %dst, align 4 - - ; REDUCEADD32 - - %V2_32 = load <2 x i32>, ptr %src, align 8 - %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32) - store volatile i32 %R2_32, ptr %dst, align 4 - - %V4_32 = load <4 x i32>, ptr %src, align 8 - %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32) - store volatile i32 %R4_32, ptr %dst, align 4 - - %V8_32 = load <8 x i32>, ptr %src, align 8 - %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32) - store volatile i32 %R8_32, ptr %dst, align 4 - - %V16_32 = load <16 x i32>, ptr %src, align 8 - %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32) - store volatile i32 %R16_32, ptr %dst, align 4 - - ; REDUCEADD16 - - %V2_16 = load <2 x i16>, ptr %src, align 8 - %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16) - store volatile i16 %R2_16, ptr %dst, align 4 - - %V4_16 = load <4 x i16>, ptr %src, align 8 - %R4_16 = call i16 
@llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16) - store volatile i16 %R4_16, ptr %dst, align 4 - - %V8_16 = load <8 x i16>, ptr %src, align 8 - %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16) - store volatile i16 %R8_16, ptr %dst, align 4 - - %V16_16 = load <16 x i16>, ptr %src, align 8 - %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16) - store volatile i16 %R16_16, ptr %dst, align 4 - - ; REDUCEADD8 - - %V2_8 = load <2 x i8>, ptr %src, align 8 - %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8) - store volatile i8 %R2_8, ptr %dst, align 4 - - %V4_8 = load <4 x i8>, ptr %src, align 8 - %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8) - store volatile i8 %R4_8, ptr %dst, align 4 - - %V8_8 = load <8 x i8>, ptr %src, align 8 - %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8) - store volatile i8 %R8_8, ptr %dst, align 4 - - %V16_8 = load <16 x i8>, ptr %src, align 8 - %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8) - store volatile i8 %R16_8, ptr %dst, align 4 - - ; EXTREME VALUES - - %V128_8 = load <128 x i8>, ptr %src, align 8 - %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8) - store volatile i8 %R128_8, ptr %dst, align 4 - - %V4_256 = load <4 x i256>, ptr %src, align 8 - %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256) - store volatile i256 %R4_256, ptr %dst, align 8 - - ret void -} - -declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) -declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) -declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) -declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) -declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) -declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) -declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) -declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) -declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) -declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) -declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) - -declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) -declare i256 @llvm.vector.reduce.add.v4i256(<4 x i256>) diff --git a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions.ll b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions.ll new file mode 100644 index 0000000000000..0def20215e988 --- /dev/null +++ b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions.ll @@ -0,0 +1,376 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \ +; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15 + +define void @fadd_reductions() { +; Z15-LABEL: 'fadd_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef) + %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef) + %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef) + %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef) + %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) + ret void +} + +define void @fast_fadd_reductions(ptr %src, ptr %dst) { +; Z15-LABEL: 'fast_fadd_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef) + %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef) + %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef) + %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef) + %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) + ret void +} + +define void @fmul_reductions() { +; Z15-LABEL: 'fmul_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef) + %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef) + %fmul_v2f64 = call double 
@llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef) + %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef) + %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) + ret void +} + +define void @fast_fmul_reductions() { +; Z15-LABEL: 'fast_fmul_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef) + %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef) + %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef) + %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef) + %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) + + ret void +} + +define void @fmin_reductions() { +; Z15-LABEL: 'fmin_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) + %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) + %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) + %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) + %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) + ret void +} + +define void @fmax_reductions() { +; Z15-LABEL: 'fmax_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 
3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) + %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) + %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) + %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) + %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) + ret void +} + +define void @reduceumin() { +; Z15-LABEL: 'reduceumin' +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.umin.v4i128(<4 x i128> undef) +; + %V2_64 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) + %V4_32 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) + + %V128_8 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) + %V4_128 = call i128 @llvm.vector.reduce.umin.v4i128(<4 x i128> undef) + + ret void +} + +define void @reduceumax() { +; Z15-LABEL: 'reduceumax' +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.umax.v4i128(<4 x i128> undef) +; + %V2_64 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) + %V4_32 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) + + %V128_8 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) + %V4_128 = call i128 @llvm.vector.reduce.umax.v4i128(<4 x i128> undef) + + ret void +} + +define void @reducesmin() { +; Z15-LABEL: 'reducesmin' +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.smin.v4i128(<4 x i128> undef) +; + %V2_64 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) + %V4_32 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) + + %V128_8 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) + %V4_128 = call i128 @llvm.vector.reduce.smin.v4i128(<4 x i128> undef) + + ret void +} + +define void @reducesmax() { +; Z15-LABEL: 'reducesmax' +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.smax.v4i128(<4 x i128> undef) +; + %V2_64 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) + %V4_32 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) + + %V128_8 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) + %V4_128 = call i128 @llvm.vector.reduce.smax.v4i128(<4 x i128> undef) + + ret void +} + +define void @reduceadd() { +; Z15-LABEL: 'reduceadd' +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x 
i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; +; Z15-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> undef) + + ; REDUCEADD64 + %V2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) + %V8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) + %V16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) + ; REDUCEADD32 + %V2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) + %V4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) + %V16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) + ; REDUCEADD16 + %V2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) + %V4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) + %V8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) + %V16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) + ; REDUCEADD8 + %V2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) + %V4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) + %V8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) + %V16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) + ; EXTREME VALUES + %V128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) + %V4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> undef) + + ret void +} + +define void @reducemul() { +; Z15-LABEL: 'reducemul' +; Z15: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; Z15: Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; Z15: Cost Model: Found an estimated cost of 5 for instruction: %V8_64 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; Z15: Cost Model: Found an estimated cost of 9 for instruction: %V16_64 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; Z15: Cost Model: Found an estimated cost of 2 for instruction: %V2_32 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; Z15: Cost Model: Found an estimated cost of 4 for instruction: %V4_32 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; Z15: Cost Model: Found an estimated cost of 5 for instruction: %V8_32 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; Z15: Cost Model: Found an estimated cost of 7 for instruction: %V16_32 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; Z15: Cost Model: Found an estimated cost of 2 for instruction: %V2_16 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; Z15: Cost Model: Found an estimated cost of 4 for instruction: %V4_16 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; Z15: Cost Model: Found an estimated cost of 6 for instruction: %V8_16 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; Z15: Cost Model: Found an estimated cost of 7 for instruction: %V16_16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; Z15: Cost Model: Found an estimated cost of 2 for instruction: %V2_8 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; Z15: Cost Model: Found an estimated cost of 4 for instruction: %V4_8 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; Z15: Cost Model: Found an estimated cost of 6 for instruction: %V8_8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; Z15: Cost Model: Found an estimated cost of 8 for instruction: %V16_8 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; +; Z15: Cost Model: Found an estimated cost of 15 for instruction: %V128_8 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) +; Z15: Cost Model: Found an estimated cost of 28 for instruction: %V4_256 = call i256 @llvm.vector.reduce.mul.v4i256(<4 x i256> undef) + + ; REDUCEMUL64 + %V2_64 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) + %V8_64 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) + %V16_64 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) + ; REDUCEMUL32 + %V2_32 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) + %V4_32 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) + %V16_32 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) + ; REDUCEMUL16 + %V2_16 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) + %V4_16 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) + %V8_16 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) + %V16_16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) + ; REDUCEMUL8 + %V2_8 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) + %V4_8 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) + %V8_8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) + %V16_8 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) + ; EXTREME VALUES + %V128_8 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) + %V4_256 = call i256 @llvm.vector.reduce.mul.v4i256(<4 x i256> undef) + + ret void +} + +declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>) +declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>) + +declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>) +declare fp128 
@llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>) + +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>) + +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>) + +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) +declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) +declare i128 @llvm.vector.reduce.umin.v4i128(<4 x i128>) + +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) +declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) +declare i128 @llvm.vector.reduce.umax.v4i128(<4 x i128>) + +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) +declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) +declare i128 @llvm.vector.reduce.smin.v4i128(<4 x i128>) + +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) +declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) +declare i128 @llvm.vector.reduce.smax.v4i128(<4 x i128>) + +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) + +declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) +declare i256 @llvm.vector.reduce.add.v4i256(<4 x i256>) + +declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>) +declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>) +declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>) +declare i16 
@llvm.vector.reduce.mul.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>) +declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>) + +declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>) +declare i256 @llvm.vector.reduce.mul.v4i256(<4 x i256>) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir index b045deebc56e0..25161652dafac 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir @@ -217,3 +217,55 @@ body: | %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>) $q0 = COPY %large(<2 x s64>) $d0 = COPY %bv(<2 x s32>) +... +--- +name: test_combine_anyext_undef +legalized: true +body: | + bb.1: + ; CHECK-PRE-LABEL: name: test_combine_anyext_undef + ; CHECK-PRE: %aext:_(s64) = G_IMPLICIT_DEF + ; CHECK-PRE-NEXT: $x0 = COPY %aext(s64) + ; + ; CHECK-POST-LABEL: name: test_combine_anyext_undef + ; CHECK-POST: %undef:_(s32) = G_IMPLICIT_DEF + ; CHECK-POST-NEXT: %aext:_(s64) = G_ANYEXT %undef(s32) + ; CHECK-POST-NEXT: $x0 = COPY %aext(s64) + %undef:_(s32) = G_IMPLICIT_DEF + %aext:_(s64) = G_ANYEXT %undef(s32) + $x0 = COPY %aext(s64) +... +--- +name: test_combine_sext_undef +legalized: true +body: | + bb.1: + ; CHECK-PRE-LABEL: name: test_combine_sext_undef + ; CHECK-PRE: %sext:_(s64) = G_CONSTANT i64 0 + ; CHECK-PRE-NEXT: $x0 = COPY %sext(s64) + ; + ; CHECK-POST-LABEL: name: test_combine_sext_undef + ; CHECK-POST: %undef:_(s32) = G_IMPLICIT_DEF + ; CHECK-POST-NEXT: %sext:_(s64) = G_SEXT %undef(s32) + ; CHECK-POST-NEXT: $x0 = COPY %sext(s64) + %undef:_(s32) = G_IMPLICIT_DEF + %sext:_(s64) = G_SEXT %undef(s32) + $x0 = COPY %sext(s64) +... +--- +name: test_combine_zext_undef +legalized: true +body: | + bb.1: + ; CHECK-PRE-LABEL: name: test_combine_zext_undef + ; CHECK-PRE: %zext:_(s64) = G_CONSTANT i64 0 + ; CHECK-PRE-NEXT: $x0 = COPY %zext(s64) + ; + ; CHECK-POST-LABEL: name: test_combine_zext_undef + ; CHECK-POST: %undef:_(s32) = G_IMPLICIT_DEF + ; CHECK-POST-NEXT: %zext:_(s64) = G_ZEXT %undef(s32) + ; CHECK-POST-NEXT: $x0 = COPY %zext(s64) + %undef:_(s32) = G_IMPLICIT_DEF + %zext:_(s64) = G_ZEXT %undef(s32) + $x0 = COPY %zext(s64) +... diff --git a/llvm/test/CodeGen/AArch64/apply-disjoint-flag-in-dagcombine.ll b/llvm/test/CodeGen/AArch64/apply-disjoint-flag-in-dagcombine.ll new file mode 100644 index 0000000000000..5622f2ae20efd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/apply-disjoint-flag-in-dagcombine.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-none-eabi %s -o - | FileCheck %s + +define i32 @test(i32 %a) { +; CHECK-LABEL: test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w0, w0, #193 +; CHECK-NEXT: ret +entry: + %add = add i32 %a, 1 + %or1 = or disjoint i32 %add, 64 + %or = or disjoint i32 %or1, 128 + ret i32 %or +} diff --git a/llvm/test/CodeGen/AArch64/cgdata-merge-gvar-nsconst.ll b/llvm/test/CodeGen/AArch64/cgdata-merge-gvar-nsconst.ll new file mode 100644 index 0000000000000..490a778f69e26 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-merge-gvar-nsconst.ll @@ -0,0 +1,32 @@ +; This test verifies that global variables (ns constant) are hashed based on their initial contents, +; allowing them to be merged even if they appear different due to their names. 
+; Now they become identical functions that can be merged without creating a parameter. + +; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=true -global-merging-skip-no-params=false < %s | FileCheck %s + +; CHECK: _f1.Tgm +; CHECK: _f2.Tgm + +%struct.__NSConstantString_tag = type { ptr, i32, ptr, i64 } +@__CFConstantStringClassReference = external global [0 x i32] +@.str.2 = private unnamed_addr constant [9 x i8] c"cfstring\00", section "__TEXT,__cstring,cstring_literals", align 1 +@_unnamed_cfstring_ = private global %struct.__NSConstantString_tag { ptr @__CFConstantStringClassReference, i32 1992, ptr @.str.2, i64 8 }, section "__DATA,__cfstring", align 8 + +@.str.3 = private unnamed_addr constant [9 x i8] c"cfstring\00", section "__TEXT,__cstring,cstring_literals", align 1 +@_unnamed_cfstring_.2 = private global %struct.__NSConstantString_tag { ptr @__CFConstantStringClassReference, i32 1992, ptr @.str.3, i64 8 }, section "__DATA,__cfstring", align 8 + +declare i32 @hoo(ptr noundef) + +define i32 @f1() { +entry: + %call = tail call i32 @hoo(ptr noundef nonnull @_unnamed_cfstring_) + %add = sub nsw i32 %call, 1 + ret i32 %add +} + +define i32 @f2() { +entry: + %call = tail call i32 @hoo(ptr noundef nonnull @_unnamed_cfstring_.2) + %add = sub nsw i32 %call, 1 + ret i32 %add +} diff --git a/llvm/test/CodeGen/AArch64/cgdata-merge-gvar-objc.ll b/llvm/test/CodeGen/AArch64/cgdata-merge-gvar-objc.ll new file mode 100644 index 0000000000000..0073114941501 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-merge-gvar-objc.ll @@ -0,0 +1,38 @@ +; This test verifies that global variables (objc metadata) are hashed based on their initial contents, +; allowing them to be merged even if they appear different due to their names. +; Now they become identical functions that can be merged without creating a parameter. + +; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=true -global-merging-skip-no-params=false < %s | FileCheck %s + +; CHECK: _f1.Tgm +; CHECK: _f2.Tgm + +%struct._class_t = type { ptr, ptr, ptr, ptr, ptr } + +@"OBJC_CLASS_$_MyClass" = external global %struct._class_t +@"OBJC_CLASSLIST_REFERENCES_$_" = internal global ptr @"OBJC_CLASS_$_MyClass", section "__DATA,__objc_classrefs,regular,no_dead_strip", align 8 +@"OBJC_CLASSLIST_REFERENCES_$_.1" = internal global ptr @"OBJC_CLASS_$_MyClass", section "__DATA,__objc_classrefs,regular,no_dead_strip", align 8 + +@OBJC_METH_VAR_NAME_ = private unnamed_addr constant [6 x i8] c"hello\00", section "__TEXT,__objc_methname,cstring_literals", align 1 +@OBJC_METH_VAR_NAME_.1 = private unnamed_addr constant [6 x i8] c"hello\00", section "__TEXT,__objc_methname,cstring_literals", align 1 + +@OBJC_SELECTOR_REFERENCES_ = internal externally_initialized global ptr @OBJC_METH_VAR_NAME_, section "__DATA,__objc_selrefs,literal_pointers,no_dead_strip", align 8 +@OBJC_SELECTOR_REFERENCES_.1 = internal externally_initialized global ptr @OBJC_METH_VAR_NAME_.1, section "__DATA,__objc_selrefs,literal_pointers,no_dead_strip", align 8 + +declare ptr @objc_msgSend(ptr, ptr, ...)
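+; f1 and f2 below differ only in the globals they reference; since the +; classref and selref pairs (and the method-name strings they point to) have +; identical initial contents, their hashes match and the two functions can be +; merged without introducing a parameter.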
+ +define i32 @f1() { +entry: + %0 = load ptr, ptr @"OBJC_CLASSLIST_REFERENCES_$_", align 8 + %1 = load ptr, ptr @OBJC_SELECTOR_REFERENCES_, align 8 + %call = tail call i32 @objc_msgSend(ptr noundef %0, ptr noundef %1) + ret i32 %call +} + +define i32 @f2() { +entry: + %0 = load ptr, ptr @"OBJC_CLASSLIST_REFERENCES_$_.1", align 8 + %1 = load ptr, ptr @OBJC_SELECTOR_REFERENCES_.1, align 8 + %call = tail call i32 @objc_msgSend(ptr noundef %0, ptr noundef %1) + ret i32 %call +} diff --git a/llvm/test/CodeGen/AArch64/cgdata-merge-gvar-string.ll b/llvm/test/CodeGen/AArch64/cgdata-merge-gvar-string.ll new file mode 100644 index 0000000000000..1e67425f0b847 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-merge-gvar-string.ll @@ -0,0 +1,46 @@ +; This test verifies that global variables (string) are hashed based on their initial contents, +; allowing them to be merged even if they appear different due to their names. +; Now they become identical functions that can be merged without creating a parameter. + +; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=true -global-merging-skip-no-params=false < %s | FileCheck %s + +; CHECK: _f1.Tgm +; CHECK: _f2.Tgm +; CHECK-NOT: _f3.Tgm +; CHECK-NOT: _f4.Tgm + +; The initial contents of `.str` and `.str.1` are identical, but they differ from those of `.str.2` and `.str.3`. +@.str = private unnamed_addr constant [6 x i8] c"hello\00", align 1 +@.str.1 = private unnamed_addr constant [6 x i8] c"hello\00", align 1 +@.str.2 = private unnamed_addr constant [6 x i8] c"diff2\00", align 1 +@.str.3 = private unnamed_addr constant [6 x i8] c"diff3\00", align 1 + +declare i32 @goo(ptr noundef) + +define i32 @f1() { +entry: + %call = tail call i32 @goo(ptr noundef nonnull @.str) + %add = add nsw i32 %call, 1 + ret i32 %add +} + +define i32 @f2() { +entry: + %call = tail call i32 @goo(ptr noundef nonnull @.str.1) + %add = add nsw i32 %call, 1 + ret i32 %add +} + +define i32 @f3() { +entry: + %call = tail call noundef i32 @goo(ptr noundef nonnull @.str.2) + %add = sub nsw i32 %call, 1 + ret i32 %add +} + +define i32 @f4() { +entry: + %call = tail call noundef i32 @goo(ptr noundef nonnull @.str.3) + %add = sub nsw i32 %call, 1 + ret i32 %add +} diff --git a/llvm/test/CodeGen/AArch64/cgdata-outline-gvar.ll b/llvm/test/CodeGen/AArch64/cgdata-outline-gvar.ll new file mode 100644 index 0000000000000..63ba1d491f9c7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-outline-gvar.ll @@ -0,0 +1,64 @@ +; This test verifies that global variables are hashed based on their initial contents, +; allowing them to be outlined even if they appear different due to their names. + +; RUN: split-file %s %t + +; The outlined function is created locally. +; Note that `.str.3` is commonly used in both `f1()` and `f2()`. +; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate -aarch64-enable-collect-loh=false \ +; RUN: %t/local-two.ll -o - | FileCheck %s --check-prefix=WRITE + +; WRITE-LABEL: _OUTLINED_FUNCTION_{{.*}}: +; WRITE: adrp x1, l_.str.3 +; WRITE-NEXT: add x1, x1, l_.str.3 +; WRITE-NEXT: mov w2 +; WRITE-NEXT: mov w3 +; WRITE-NEXT: mov w4 +; WRITE-NEXT: b + +; Create an object file and merge it into the cgdata. +; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate -aarch64-enable-collect-loh=false \ +; RUN: -filetype=obj %t/local-two.ll -o %t_write_base +; RUN: llvm-cgdata --merge %t_write_base -o %t_cgdata_base + +; Read the cgdata in the machine outliner to optimistically outline the matching sequence in local-one.ll.
+; Note that the hash of `.str.5` in local-one.ll matches that of `.str.3` in an outlined tree in the cgdata. + +; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata_base -aarch64-enable-collect-loh=false \ +; RUN: %t/local-one.ll -o - | FileCheck %s --check-prefix=READ + +; READ-LABEL: _OUTLINED_FUNCTION_{{.*}}: +; READ: adrp x1, l_.str.5 +; READ-NEXT: add x1, x1, l_.str.5 +; READ-NEXT: mov w2 +; READ-NEXT: mov w3 +; READ-NEXT: mov w4 +; READ-NEXT: b + +;--- local-two.ll +@.str.1 = private unnamed_addr constant [3 x i8] c"f1\00", align 1 +@.str.2 = private unnamed_addr constant [3 x i8] c"f2\00", align 1 +@.str.3 = private unnamed_addr constant [6 x i8] c"hello\00", align 1 + +declare noundef i32 @goo(ptr noundef, ptr noundef, i32, i32, i32) +define i32 @f1() minsize { +entry: + %call = tail call noundef i32 @goo(ptr noundef nonnull @.str.1, ptr noundef nonnull @.str.3, i32 1, i32 2, i32 3) + ret i32 %call +} +define i32 @f2() minsize { +entry: + %call = tail call noundef i32 @goo(ptr noundef nonnull @.str.2, ptr noundef nonnull @.str.3, i32 1, i32 2, i32 3) + ret i32 %call +} + +;--- local-one.ll +@.str.4 = private unnamed_addr constant [3 x i8] c"f3\00", align 1 +@.str.5 = private unnamed_addr constant [6 x i8] c"hello\00", align 1 + +declare noundef i32 @goo(ptr noundef, ptr noundef, i32, i32, i32) +define i32 @f1() minsize { +entry: + %call = tail call noundef i32 @goo(ptr noundef nonnull @.str.4, ptr noundef nonnull @.str.5, i32 1, i32 2, i32 3) + ret i32 %call +} diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index 36583b89ce5fc..0daa6e7f16202 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -385,19 +385,11 @@ entry: } define <8 x i16> @concat_high_high_v8i16(<8 x i16> %a_vec, <8 x i16> %b_vec) { -; CHECK-SD-LABEL: concat_high_high_v8i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: concat_high_high_v8i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: concat_high_high_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.d[0], v0.d[1] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret entry: %shuffle.i3 = shufflevector <8 x i16> %a_vec, <8 x i16> poison, <4 x i32> %shuffle.i = shufflevector <8 x i16> %b_vec, <8 x i16> poison, <4 x i32> @@ -406,19 +398,11 @@ entry: } define <8 x half> @concat_high_high_v8f16(<8 x half> %a_vec, <8 x half> %b_vec) { -; CHECK-SD-LABEL: concat_high_high_v8f16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: concat_high_high_v8f16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: concat_high_high_v8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.d[0], v0.d[1] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret entry: %shuffle.i3 = shufflevector <8 x half> %a_vec, <8 x half> poison, <4 x i32> %shuffle.i = shufflevector <8 x half> %b_vec, <8 x half> poison, <4 x i32> @@ -427,19 +411,11 @@ entry: } 
define <8 x bfloat> @concat_high_high_v8bf16(<8 x bfloat> %a_vec, <8 x bfloat> %b_vec) { -; CHECK-SD-LABEL: concat_high_high_v8bf16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: concat_high_high_v8bf16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: concat_high_high_v8bf16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.d[0], v0.d[1] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret entry: %shuffle.i3 = shufflevector <8 x bfloat> %a_vec, <8 x bfloat> poison, <4 x i32> %shuffle.i = shufflevector <8 x bfloat> %b_vec, <8 x bfloat> poison, <4 x i32> @@ -455,9 +431,8 @@ define <4 x i32> @concat_high_high_v4i32(<4 x i32> %a_vec, <4 x i32> %b_vec) { ; ; CHECK-GI-LABEL: concat_high_high_v4i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: mov v1.d[0], v0.d[1] +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret entry: %shuffle.i3 = shufflevector <4 x i32> %a_vec, <4 x i32> poison, <2 x i32> @@ -474,9 +449,8 @@ define <4 x float> @concat_high_high_v4f32(<4 x float> %a_vec, <4 x float> %b_ve ; ; CHECK-GI-LABEL: concat_high_high_v4f32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: mov v1.d[0], v0.d[1] +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret entry: %shuffle.i3 = shufflevector <4 x float> %a_vec, <4 x float> poison, <2 x i32> @@ -486,19 +460,11 @@ entry: } define <16 x i8> @concat_high_high_v16i8(<16 x i8> %a_vec, <16 x i8> %b_vec) { -; CHECK-SD-LABEL: concat_high_high_v16i8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: concat_high_high_v16i8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: concat_high_high_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v1.d[0], v0.d[1] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret entry: %shuffle.i3 = shufflevector <16 x i8> %a_vec, <16 x i8> poison, <8 x i32> %shuffle.i = shufflevector <16 x i8> %b_vec, <16 x i8> poison, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll index 5e5fdd6d31705..e89e1516fb1f5 100644 --- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll @@ -8,17 +8,10 @@ ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v4i32_vector_extract_const define i64 @extract_v2i64_undef_index(<2 x i64> %a, i32 %c) { -; CHECK-SD-LABEL: extract_v2i64_undef_index: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fmov x0, d0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: extract_v2i64_undef_index: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: str q0, [sp, #-16]! 
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: ldr x0, [sp], #16 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: extract_v2i64_undef_index: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret entry: %d = extractelement <2 x i64> %a, i32 undef ret i64 %d diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-merge-past-memcpy.mir b/llvm/test/CodeGen/AArch64/stack-tagging-merge-past-memcpy.mir new file mode 100644 index 0000000000000..45f6bfe80ac2b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-tagging-merge-past-memcpy.mir @@ -0,0 +1,103 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64 -mattr=+mte -run-pass=prologepilog %s -o - | FileCheck %s +--- | + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" + target triple = "aarch64-unknown-none-elf" + + @glob = global [8 x i32] zeroinitializer, align 4 + + declare dso_local void @F78(ptr %B) + + define void @F55() sanitize_memtag "target-features"="+mte,+strict-align" { + entry: + %basetag = call ptr @llvm.aarch64.irg.sp(i64 0) + %A = alloca i32, i32 8, align 16 + %A.tag = call ptr @llvm.aarch64.tagp.p0(ptr %A, ptr %basetag, i64 0) + %B = alloca i32, i32 8, align 4 + %C = alloca i32, i32 8, align 16 + %C.tag = call ptr @llvm.aarch64.tagp.p0(ptr %C, ptr %basetag, i64 1) + call void @llvm.aarch64.settag(ptr %C.tag, i64 32) + call void @F56(ptr %C.tag) + call void @llvm.lifetime.start.p0(i64 32, ptr %A) + call void @llvm.aarch64.settag(ptr %A.tag, i64 32) + call void @F56(ptr %A.tag) + call void @llvm.aarch64.settag(ptr %A, i64 32) + call void @llvm.lifetime.end.p0(i64 32, ptr %A) + call void @llvm.lifetime.start.p0(i64 32, ptr %A) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 %A, ptr align 4 @glob, i64 32, i1 false) + call void @F78(ptr %A) + call void @llvm.lifetime.end.p0(i64 32, ptr %A) + call void @llvm.aarch64.settag(ptr %C, i64 32) + ret void + } + + declare void @F56(ptr) +... 
+--- +name: F55 +frameInfo: + adjustsStack: true +stack: + - { id: 0, name: A, type: default, offset: 0, size: 32, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: -32, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 2, name: C, type: default, offset: 0, size: 32, alignment: 16, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: -64, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } +body: | + bb.0.entry: + ; CHECK-LABEL: name: F55 + ; CHECK: liveins: $x19, $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: frame-setup EMITMTETAGGED + ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 80, 0 + ; CHECK-NEXT: frame-setup STPXi killed $lr, killed $x19, $sp, 8 :: (store (s64) into %stack.3), (store (s64) into %stack.2) + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 80 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w19, -8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -16 + ; CHECK-NEXT: renamable $x0 = IRGstack $sp, $xzr + ; CHECK-NEXT: renamable $x19 = TAGPstack $x0, 2, renamable $x0, 1 + ; CHECK-NEXT: ST2Gi renamable $x0, renamable $x0, 0 :: (store (s256) into %ir.C.tag, align 16) + ; CHECK-NEXT: BL @F56, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ; CHECK-NEXT: ST2Gi renamable $x19, renamable $x19, 0 :: (store (s256) into %ir.A.tag, align 16) + ; CHECK-NEXT: $x0 = COPY killed renamable $x19 + ; CHECK-NEXT: BL @F56, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ; CHECK-NEXT: ST2Gi $sp, $sp, 2 :: (store (s256) into %ir.A, align 16) + ; CHECK-NEXT: renamable $x1 = LOADgot target-flags(aarch64-got) @glob + ; CHECK-NEXT: $x0 = ADDXri $sp, 32, 0 + ; CHECK-NEXT: dead $w2 = MOVi32imm 32, implicit-def $x2 + ; CHECK-NEXT: BL &memcpy, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit-def $sp, implicit-def dead $x0 + ; CHECK-NEXT: $x0 = ADDXri $sp, 32, 0 + ; CHECK-NEXT: BL @F78, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ; CHECK-NEXT: ST2Gi $sp, $sp, 0 :: (store (s256) into %ir.C, align 16) + ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 64, 0 + ; CHECK-NEXT: early-clobber $sp, $lr, $x19 = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.3), (load (s64) from %stack.2) + ; CHECK-NEXT: RET_ReallyLR + renamable $x0 = IRGstack $sp, $xzr + renamable $x19 = TAGPstack %stack.0.A, 0, renamable $x0, 1 + ST2Gi renamable $x0, renamable $x0, 0 :: (store (s256) into %ir.C.tag, align 16) + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + BL @F56, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ST2Gi renamable $x19, renamable $x19, 0 :: (store (s256) into %ir.A.tag, align 16) + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + $x0 = COPY killed renamable $x19 + BL @F56, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ST2Gi $sp, %stack.0.A, 0 :: (store (s256) into %ir.A, align 16) + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + renamable $x1 = LOADgot target-flags(aarch64-got) @glob + $x0 = ADDXri %stack.0.A, 0, 0 + dead $w2 = MOVi32imm 32, implicit-def $x2 + BL &memcpy, csr_aarch64_aapcs, implicit-def dead 
$lr, implicit $sp, implicit $x0, implicit $x1, implicit $x2, implicit-def $sp, implicit-def dead $x0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + $x0 = ADDXri %stack.0.A, 0, 0 + BL @F78, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ST2Gi $sp, %stack.2.C, 0 :: (store (s256) into %ir.C, align 16) + RET_ReallyLR + +... diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 184aa0226fe77..8473f45f6c803 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -4885,8 +4885,7 @@ entry: define i32 @extract_hi_hi(<8 x i16> %a) { ; CHECK-SD-LABEL: extract_hi_hi: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-SD-NEXT: mov v0.d[1], v0.d[0] +; CHECK-SD-NEXT: mov v0.d[0], v0.d[1] ; CHECK-SD-NEXT: uaddlv s0, v0.8h ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir index 7893bfa1d38f0..9b39afd32ac37 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir @@ -261,8 +261,7 @@ body: | ; CHECK-LABEL: name: cvt_f32_ubyte0_zext_lshr_16 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %shift:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %shift(s16) + ; CHECK-NEXT: %zext:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %zext ; CHECK-NEXT: $vgpr0 = COPY %result(s32) %arg:_(s32) = COPY $vgpr0 @@ -284,8 +283,7 @@ body: | ; CHECK-LABEL: name: cvt_f32_ubyte0_zext_lshr_24 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %shift:_(s16) = G_IMPLICIT_DEF - ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %shift(s16) + ; CHECK-NEXT: %zext:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %zext ; CHECK-NEXT: $vgpr0 = COPY %result(s32) %arg:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbanklegalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbanklegalize.mir new file mode 100644 index 0000000000000..e840c3f1b86ab --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbanklegalize.mir @@ -0,0 +1,858 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=none %s -verify-machineinstrs -o - | FileCheck %s + +--- +name: uniform_in_vgpr +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: uniform_in_vgpr + ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[COPY]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[COPY1]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + 
%1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $vgpr0 + %3:_(s32) = COPY $vgpr1 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s32) = G_FPTOUI %0(s32) + %6:_(s32) = G_ADD %5, %1 + G_STORE %6(s32), %4(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: back_to_back_uniform_in_vgpr +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: back_to_back_uniform_in_vgpr + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[COPY2]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $vgpr0 + %4:_(s32) = COPY $vgpr1 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = G_FADD %0, %1 + %7:_(s32) = G_FPTOUI %6(s32) + %8:_(s32) = G_ADD %7, %2 + G_STORE %8(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: buffer_load_uniform +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: buffer_load_uniform + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[C1]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $sgpr3 + %4:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s32) = COPY $sgpr4 + %6:_(s32) = COPY $vgpr0 + %7:_(s32) = COPY $vgpr1 + %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s32) = G_CONSTANT i32 0 + %10:_(<4 x s32>) = G_AMDGPU_BUFFER_LOAD %4(<4 x s32>), %9(s32), %5, %9, 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + %11:_(s32) = G_CONSTANT i32 
1 + %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32) = G_UNMERGE_VALUES %10(<4 x s32>) + %16:_(s32) = G_ADD %13, %11 + G_STORE %16(s32), %8(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: buffer_load_divergent +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: buffer_load_divergent + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[C1]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $sgpr3 + %4:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s32) = COPY $vgpr0 + %6:_(s32) = COPY $vgpr1 + %7:_(s32) = COPY $vgpr2 + %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s32) = G_CONSTANT i32 0 + %10:_(<4 x s32>) = G_AMDGPU_BUFFER_LOAD %4(<4 x s32>), %9(s32), %5, %9, 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + %11:_(s32) = G_CONSTANT i32 1 + %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32) = G_UNMERGE_VALUES %10(<4 x s32>) + %16:_(s32) = G_ADD %13, %11 + G_STORE %16(s32), %8(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: vgpr_and_i64 +legalized: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; CHECK-LABEL: name: vgpr_and_i64 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV]], [[MV1]] + ; CHECK-NEXT: G_STORE [[AND]](s64), [[MV2]](p1) :: (store (s64), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(s64) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = COPY $vgpr4 + %7:_(s32) = COPY $vgpr5 + %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s64) = G_AND %2, %5 + G_STORE %9(s64), %8(p1) :: (store (s64), addrspace 1) + S_ENDPGM 0 +... + +--- +name: abs_sgpr_i16 +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: abs_sgpr_i16 + ; CHECK: liveins: $sgpr0, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: [[ABS:%[0-9]+]]:_(s16) = G_ABS [[TRUNC]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ABS]](s16) + ; CHECK-NEXT: G_STORE [[ANYEXT]](s32), [[MV]](p1) :: (store (s16), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s16) = G_TRUNC %0(s32) + %2:_(s32) = COPY $vgpr0 + %3:_(s32) = COPY $vgpr1 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s16) = G_ABS %1 + %6:_(s32) = G_ANYEXT %5(s16) + G_STORE %6(s32), %4(p1) :: (store (s16), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: uniform_i1_phi +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: uniform_i1_phi + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x30000000), %bb.2(0x50000000) + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]] + ; CHECK-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2 + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1 + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[PHI]](s1) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[C3]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x30000000), %bb.2(0x50000000) + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32) + %3:_(s32) = COPY $sgpr0 + %4:_(s32) = COPY $sgpr1 + %5:_(s32) = G_CONSTANT i32 6 + %6:_(s1) = G_ICMP intpred(uge), %3(s32), %5 + %7:_(s32) = G_CONSTANT i32 0 + %8:_(s1) = G_ICMP intpred(ne), %4(s32), %7 + G_BRCOND %8(s1), %bb.2 + G_BR %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + %9:_(s32) = G_CONSTANT i32 1 + %10:_(s1) = G_ICMP intpred(ult), %3(s32), %9 + + bb.2: + %11:_(s1) = G_PHI %6(s1), %bb.0, %10(s1), %bb.1 + %12:_(s32) = G_SEXT %11(s1) + %13:_(s32) = G_CONSTANT i32 2 + %14:_(s32) = G_ADD %12, %13 + G_STORE %14(s32), %2(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
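+# The next two tests, as their names indicate, exercise an i1 compare result crossing sides: +# vcc_to_scc selects between SGPR values under a floating-point compare condition, while +# scc_to_vcc selects between VGPR values under a uniform integer compare.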
+ +--- +name: vcc_to_scc +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: vcc_to_scc + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $vgpr0 + %4:_(s32) = COPY $vgpr1 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = G_FCONSTANT float 0.000000e+00 + %7:_(s1) = G_FCMP floatpred(oeq), %0(s32), %6 + %8:_(s32) = G_SELECT %7(s1), %1, %2 + G_STORE %8(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: scc_to_vcc +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: scc_to_vcc + ; CHECK: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = G_CONSTANT i32 0 + %7:_(s1) = G_ICMP intpred(eq), %0(s32), %6 + %8:_(s32) = G_SELECT %7(s1), %1, %2 + G_STORE %8(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: vgpr_to_vcc_trunc +legalized: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; CHECK-LABEL: name: vgpr_to_vcc_trunc + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[TRUNC]](s1), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(s32) = COPY $vgpr4 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s1) = G_TRUNC %0(s32) + %7:_(s32) = G_SELECT %6(s1), %1, %2 + G_STORE %7(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: zext +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: zext + ; CHECK: liveins: $sgpr0, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK-NEXT: G_STORE [[ZEXT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 10 + %5:_(s1) = G_ICMP intpred(eq), %0(s32), %4 + %6:_(s32) = G_ZEXT %5(s1) + G_STORE %6(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: sext +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: sext + ; CHECK: liveins: $sgpr0, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) + ; CHECK-NEXT: G_STORE [[SEXT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 10 + %5:_(s1) = G_ICMP intpred(eq), %0(s32), %4 + %6:_(s32) = G_SEXT %5(s1) + G_STORE %6(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: and_i1_vcc +legalized: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: and_i1_vcc + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP]], [[ICMP1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[COPY]], [[COPY1]] + ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s32) = G_CONSTANT i32 10 + %6:_(s1) = G_ICMP intpred(uge), %0(s32), %5 + %7:_(s32) = G_CONSTANT i32 20 + %8:_(s1) = G_ICMP intpred(uge), %1(s32), %7 + %9:_(s1) = G_AND %6, %8 + %10:_(s32) = G_SELECT %9(s1), %0, %1 + G_STORE %10(s32), %4(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: and_i1_scc +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: and_i1_scc + ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP]], [[ICMP1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[COPY]], [[COPY1]] + ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $vgpr0 + %3:_(s32) = COPY $vgpr1 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s32) = G_CONSTANT i32 10 + %6:_(s1) = G_ICMP intpred(uge), %0(s32), %5 + %7:_(s32) = G_CONSTANT i32 20 + %8:_(s1) = G_ICMP intpred(uge), %1(s32), %7 + %9:_(s1) = G_AND %6, %8 + %10:_(s32) = G_SELECT %9(s1), %0, %1 + G_STORE %10(s32), %4(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: divergent_phi_with_uniform_inputs +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: divergent_phi_with_uniform_inputs + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, [[C1]](s32), %bb.1 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) + ; CHECK-NEXT: G_STORE [[PHI]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 0 + %5:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %0(s32), %4 + %6:sreg_32_xm0_xexec(s32) = SI_IF %5(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + %7:_(s32) = G_CONSTANT i32 1 + + bb.2: + %8:_(s32) = G_PHI %4(s32), %bb.0, %7(s32), %bb.1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %6(s32) + G_STORE %8(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
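+# In the test below, the loop counter is incremented uniformly on every iteration, but the +# exit condition compares against a VGPR value, so lanes leave the loop at different +# iterations and the counter's value at its post-loop use (the G_MUL in bb.2) is +# temporally divergent.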
+ +--- +name: divergent_because_of_temporal_divergent_use +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: divergent_because_of_temporal_divergent_use + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C2]] + ; CHECK-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[ADD]](s32) + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32) + ; CHECK-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: G_BR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.1 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[PHI2]], [[C3]] + ; CHECK-NEXT: G_STORE [[MUL]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 -1 + %5:_(s32) = G_CONSTANT i32 0 + + bb.1: + successors: %bb.2(0x04000000), %bb.1(0x7c000000) + + %6:_(s32) = G_PHI %7(s32), %bb.1, %5(s32), %bb.0 + %8:_(s32) = G_PHI %4(s32), %bb.0, %9(s32), %bb.1 + %10:_(s32) = G_CONSTANT i32 1 + %9:_(s32) = G_ADD %8, %10 + %11:_(s32) = G_UITOFP %9(s32) + %12:_(s1) = G_FCMP floatpred(ogt), %11(s32), %0 + %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %12(s1), %6(s32) + SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.2 + + bb.2: + %13:_(s32) = G_PHI %9(s32), %bb.1 + %14:_(s32) = G_PHI %7(s32), %bb.1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32) + %15:_(s32) = G_CONSTANT i32 10 + %16:_(s32) = G_MUL %13, %15 + G_STORE %16(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: loop_with_2breaks +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: loop_with_2breaks + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %13(s1), %bb.3 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]] + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) + ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: G_BR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C4]](s32) + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C5]] + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[COPY9]](s1) + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF 
[[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: G_BR %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %43(s1), %bb.5 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %44(s32), %bb.5, [[DEF]](s32), %bb.1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) + ; CHECK-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32) + ; CHECK-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: G_BR %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C7]](s32) + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL2]](s64) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD2]], [[C8]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C8]] + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C9]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc + ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY9]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY13]](s1) + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) + ; CHECK-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc + ; CHECK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc + ; CHECK-NEXT: G_BR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = COPY $vgpr4 + %7:_(s32) = COPY $vgpr5 + 
%8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s32) = G_IMPLICIT_DEF + %10:_(s32) = G_CONSTANT i32 0 + %11:sreg_32(s1) = IMPLICIT_DEF + + bb.1: + successors: %bb.2(0x40000000), %bb.3(0x40000000) + + %12:sreg_32(s1) = PHI %11(s1), %bb.0, %13(s1), %bb.3 + %14:_(s32) = G_PHI %15(s32), %bb.3, %10(s32), %bb.0 + %16:_(s32) = G_PHI %10(s32), %bb.0, %17(s32), %bb.3 + %18:sreg_32(s1) = COPY %12(s1) + %19:_(s64) = G_SEXT %16(s32) + %20:_(s32) = G_CONSTANT i32 2 + %21:_(s64) = G_SHL %19, %20(s32) + %22:_(p1) = G_PTR_ADD %5, %21(s64) + %23:_(s32) = G_LOAD %22(p1) :: (load (s32), addrspace 1) + %24:_(s32) = G_CONSTANT i32 0 + %25:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %23(s32), %24 + %26:_(s1) = G_CONSTANT i1 true + %27:sreg_32(s1) = COPY %26(s1) + %28:sreg_32(s1) = S_ANDN2_B32 %18(s1), $exec_lo, implicit-def $scc + %29:sreg_32(s1) = S_AND_B32 $exec_lo, %27(s1), implicit-def $scc + %30:sreg_32(s1) = S_OR_B32 %28(s1), %29(s1), implicit-def $scc + %31:sreg_32(s1) = COPY %30(s1) + %32:sreg_32_xm0_xexec(s32) = SI_IF %25(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.2 + + bb.2: + successors: %bb.4(0x40000000), %bb.5(0x40000000) + + %33:_(s32) = G_CONSTANT i32 2 + %34:_(s64) = G_SHL %19, %33(s32) + %35:_(p1) = G_PTR_ADD %8, %34(s64) + %36:_(s32) = G_LOAD %35(p1) :: (load (s32), addrspace 1) + %37:_(s32) = G_CONSTANT i32 0 + %38:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %36(s32), %37 + %39:_(s1) = G_CONSTANT i1 true + %40:sreg_32(s1) = COPY %39(s1) + %41:sreg_32(s1) = COPY %40(s1) + %42:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.4 + + bb.3: + successors: %bb.6(0x04000000), %bb.1(0x7c000000) + + %13:sreg_32(s1) = PHI %30(s1), %bb.1, %43(s1), %bb.5 + %17:_(s32) = G_PHI %44(s32), %bb.5, %9(s32), %bb.1 + %45:sreg_32(s1) = COPY %13(s1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %32(s32) + %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %45(s1), %14(s32) + SI_LOOP %15(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.6 + + bb.4: + successors: %bb.5(0x80000000) + + %46:_(s32) = G_CONSTANT i32 2 + %47:_(s64) = G_SHL %19, %46(s32) + %48:_(p1) = G_PTR_ADD %2, %47(s64) + %49:_(s32) = G_LOAD %48(p1) :: (load (s32), addrspace 1) + %50:_(s32) = G_CONSTANT i32 1 + %51:_(s32) = G_ADD %49, %50 + G_STORE %51(s32), %48(p1) :: (store (s32), addrspace 1) + %52:_(s32) = G_ADD %16, %50 + %53:_(s32) = G_CONSTANT i32 100 + %54:_(s1) = G_ICMP intpred(ult), %16(s32), %53 + %55:sreg_32(s1) = COPY %54(s1) + %56:sreg_32(s1) = S_ANDN2_B32 %41(s1), $exec_lo, implicit-def $scc + %57:sreg_32(s1) = S_AND_B32 $exec_lo, %55(s1), implicit-def $scc + %58:sreg_32(s1) = S_OR_B32 %56(s1), %57(s1), implicit-def $scc + + bb.5: + successors: %bb.3(0x80000000) + + %59:sreg_32(s1) = PHI %40(s1), %bb.2, %58(s1), %bb.4 + %44:_(s32) = G_PHI %52(s32), %bb.4, %9(s32), %bb.2 + %60:sreg_32(s1) = COPY %59(s1) + %61:sreg_32(s1) = COPY %60(s1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %42(s32) + %62:sreg_32(s1) = S_ANDN2_B32 %31(s1), $exec_lo, implicit-def $scc + %63:sreg_32(s1) = S_AND_B32 $exec_lo, %61(s1), implicit-def $scc + %43:sreg_32(s1) = S_OR_B32 %62(s1), %63(s1), implicit-def $scc + G_BR %bb.3 + + bb.6: + %64:_(s32) = G_PHI %15(s32), %bb.3 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %64(s32) + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbankselect.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbankselect.mir new file mode 100644 index 0000000000000..e840c3f1b86ab --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbankselect.mir @@ -0,0 +1,858 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=none %s -verify-machineinstrs -o - | FileCheck %s + +--- +name: uniform_in_vgpr +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: uniform_in_vgpr + ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[COPY]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[COPY1]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $vgpr0 + %3:_(s32) = COPY $vgpr1 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s32) = G_FPTOUI %0(s32) + %6:_(s32) = G_ADD %5, %1 + G_STORE %6(s32), %4(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: back_to_back_uniform_in_vgpr +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: back_to_back_uniform_in_vgpr + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[COPY2]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $vgpr0 + %4:_(s32) = COPY $vgpr1 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = G_FADD %0, %1 + %7:_(s32) = G_FPTOUI %6(s32) + %8:_(s32) = G_ADD %7, %2 + G_STORE %8(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: buffer_load_uniform +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: buffer_load_uniform + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[C1]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $sgpr3 + %4:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s32) = COPY $sgpr4 + %6:_(s32) = COPY $vgpr0 + %7:_(s32) = COPY $vgpr1 + %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s32) = G_CONSTANT i32 0 + %10:_(<4 x s32>) = G_AMDGPU_BUFFER_LOAD %4(<4 x s32>), %9(s32), %5, %9, 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + %11:_(s32) = G_CONSTANT i32 1 + %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32) = G_UNMERGE_VALUES %10(<4 x s32>) + %16:_(s32) = G_ADD %13, %11 + G_STORE %16(s32), %8(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: buffer_load_divergent +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: buffer_load_divergent + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[C1]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $sgpr3 + %4:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s32) = COPY $vgpr0 + %6:_(s32) = COPY $vgpr1 + %7:_(s32) = COPY $vgpr2 + %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s32) = G_CONSTANT i32 0 + %10:_(<4 x s32>) = G_AMDGPU_BUFFER_LOAD %4(<4 x s32>), %9(s32), %5, %9, 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + %11:_(s32) = G_CONSTANT i32 1 + %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32) = G_UNMERGE_VALUES %10(<4 x s32>) + %16:_(s32) = G_ADD %13, %11 + G_STORE %16(s32), %8(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: vgpr_and_i64 +legalized: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; CHECK-LABEL: name: vgpr_and_i64 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV]], [[MV1]] + ; CHECK-NEXT: G_STORE [[AND]](s64), [[MV2]](p1) :: (store (s64), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(s64) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = COPY $vgpr4 + %7:_(s32) = COPY $vgpr5 + %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s64) = G_AND %2, %5 + G_STORE %9(s64), %8(p1) :: (store (s64), addrspace 1) + S_ENDPGM 0 +... + +--- +name: abs_sgpr_i16 +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: abs_sgpr_i16 + ; CHECK: liveins: $sgpr0, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: [[ABS:%[0-9]+]]:_(s16) = G_ABS [[TRUNC]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ABS]](s16) + ; CHECK-NEXT: G_STORE [[ANYEXT]](s32), [[MV]](p1) :: (store (s16), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s16) = G_TRUNC %0(s32) + %2:_(s32) = COPY $vgpr0 + %3:_(s32) = COPY $vgpr1 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s16) = G_ABS %1 + %6:_(s32) = G_ANYEXT %5(s16) + G_STORE %6(s32), %4(p1) :: (store (s16), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: uniform_i1_phi +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: uniform_i1_phi + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x30000000), %bb.2(0x50000000) + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]] + ; CHECK-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2 + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1 + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[PHI]](s1) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT]], [[C3]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x30000000), %bb.2(0x50000000) + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32) + %3:_(s32) = COPY $sgpr0 + %4:_(s32) = COPY $sgpr1 + %5:_(s32) = G_CONSTANT i32 6 + %6:_(s1) = G_ICMP intpred(uge), %3(s32), %5 + %7:_(s32) = G_CONSTANT i32 0 + %8:_(s1) = G_ICMP intpred(ne), %4(s32), %7 + G_BRCOND %8(s1), %bb.2 + G_BR %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + %9:_(s32) = G_CONSTANT i32 1 + %10:_(s1) = G_ICMP intpred(ult), %3(s32), %9 + + bb.2: + %11:_(s1) = G_PHI %6(s1), %bb.0, %10(s1), %bb.1 + %12:_(s32) = G_SEXT %11(s1) + %13:_(s32) = G_CONSTANT i32 2 + %14:_(s32) = G_ADD %12, %13 + G_STORE %14(s32), %2(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
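+
+# Both phi inputs and the branch condition above are uniform, so the i1 phi
+# needs no lane-mask bookkeeping: it stays a plain G_PHI(s1) that can later be
+# kept in an SGPR and extended with G_SEXT.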
+ +--- +name: vcc_to_scc +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: vcc_to_scc + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $vgpr0 + %4:_(s32) = COPY $vgpr1 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = G_FCONSTANT float 0.000000e+00 + %7:_(s1) = G_FCMP floatpred(oeq), %0(s32), %6 + %8:_(s32) = G_SELECT %7(s1), %1, %2 + G_STORE %8(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: scc_to_vcc +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: scc_to_vcc + ; CHECK: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = G_CONSTANT i32 0 + %7:_(s1) = G_ICMP intpred(eq), %0(s32), %6 + %8:_(s32) = G_SELECT %7(s1), %1, %2 + G_STORE %8(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: vgpr_to_vcc_trunc +legalized: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; CHECK-LABEL: name: vgpr_to_vcc_trunc + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[TRUNC]](s1), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(s32) = COPY $vgpr4 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s1) = G_TRUNC %0(s32) + %7:_(s32) = G_SELECT %6(s1), %1, %2 + G_STORE %7(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: zext +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: zext + ; CHECK: liveins: $sgpr0, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK-NEXT: G_STORE [[ZEXT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 10 + %5:_(s1) = G_ICMP intpred(eq), %0(s32), %4 + %6:_(s32) = G_ZEXT %5(s1) + G_STORE %6(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: sext +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: sext + ; CHECK: liveins: $sgpr0, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1) + ; CHECK-NEXT: G_STORE [[SEXT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 10 + %5:_(s1) = G_ICMP intpred(eq), %0(s32), %4 + %6:_(s32) = G_SEXT %5(s1) + G_STORE %6(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: and_i1_vcc +legalized: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: and_i1_vcc + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP]], [[ICMP1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[COPY]], [[COPY1]] + ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s32) = G_CONSTANT i32 10 + %6:_(s1) = G_ICMP intpred(uge), %0(s32), %5 + %7:_(s32) = G_CONSTANT i32 20 + %8:_(s1) = G_ICMP intpred(uge), %1(s32), %7 + %9:_(s1) = G_AND %6, %8 + %10:_(s32) = G_SELECT %9(s1), %0, %1 + G_STORE %10(s32), %4(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: and_i1_scc +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: and_i1_scc + ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s1) = G_AND [[ICMP]], [[ICMP1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s1), [[COPY]], [[COPY1]] + ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $vgpr0 + %3:_(s32) = COPY $vgpr1 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s32) = G_CONSTANT i32 10 + %6:_(s1) = G_ICMP intpred(uge), %0(s32), %5 + %7:_(s32) = G_CONSTANT i32 20 + %8:_(s1) = G_ICMP intpred(uge), %1(s32), %7 + %9:_(s1) = G_AND %6, %8 + %10:_(s32) = G_SELECT %9(s1), %0, %1 + G_STORE %10(s32), %4(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: divergent_phi_with_uniform_inputs +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: divergent_phi_with_uniform_inputs + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, [[C1]](s32), %bb.1 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) + ; CHECK-NEXT: G_STORE [[PHI]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 0 + %5:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %0(s32), %4 + %6:sreg_32_xm0_xexec(s32) = SI_IF %5(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + %7:_(s32) = G_CONSTANT i32 1 + + bb.2: + %8:_(s32) = G_PHI %4(s32), %bb.0, %7(s32), %bb.1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %6(s32) + G_STORE %8(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
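+
+# The phi inputs above are uniform constants, but the SI_IF branch depends on
+# a VGPR compare, so lanes reach bb.2 from different predecessors. Which input
+# a given lane sees is therefore per-lane, making the phi result divergent.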
+ +--- +name: divergent_because_of_temporal_divergent_use +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: divergent_because_of_temporal_divergent_use + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C2]] + ; CHECK-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[ADD]](s32) + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32) + ; CHECK-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: G_BR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.1 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.1 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[PHI2]], [[C3]] + ; CHECK-NEXT: G_STORE [[MUL]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 -1 + %5:_(s32) = G_CONSTANT i32 0 + + bb.1: + successors: %bb.2(0x04000000), %bb.1(0x7c000000) + + %6:_(s32) = G_PHI %7(s32), %bb.1, %5(s32), %bb.0 + %8:_(s32) = G_PHI %4(s32), %bb.0, %9(s32), %bb.1 + %10:_(s32) = G_CONSTANT i32 1 + %9:_(s32) = G_ADD %8, %10 + %11:_(s32) = G_UITOFP %9(s32) + %12:_(s1) = G_FCMP floatpred(ogt), %11(s32), %0 + %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %12(s1), %6(s32) + SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.2 + + bb.2: + %13:_(s32) = G_PHI %9(s32), %bb.1 + %14:_(s32) = G_PHI %7(s32), %bb.1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32) + %15:_(s32) = G_CONSTANT i32 10 + %16:_(s32) = G_MUL %13, %15 + G_STORE %16(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: loop_with_2breaks +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: loop_with_2breaks + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %13(s1), %bb.3 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]] + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) + ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: G_BR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C4]](s32) + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C5]] + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[COPY9]](s1) + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF 
[[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: G_BR %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %43(s1), %bb.5 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %44(s32), %bb.5, [[DEF]](s32), %bb.1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) + ; CHECK-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32) + ; CHECK-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: G_BR %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C7]](s32) + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL2]](s64) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD2]], [[C8]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C8]] + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C9]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc + ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY9]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY13]](s1) + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) + ; CHECK-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc + ; CHECK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc + ; CHECK-NEXT: G_BR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.3 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = COPY $vgpr4 + %7:_(s32) = COPY $vgpr5 + 
%8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s32) = G_IMPLICIT_DEF + %10:_(s32) = G_CONSTANT i32 0 + %11:sreg_32(s1) = IMPLICIT_DEF + + bb.1: + successors: %bb.2(0x40000000), %bb.3(0x40000000) + + %12:sreg_32(s1) = PHI %11(s1), %bb.0, %13(s1), %bb.3 + %14:_(s32) = G_PHI %15(s32), %bb.3, %10(s32), %bb.0 + %16:_(s32) = G_PHI %10(s32), %bb.0, %17(s32), %bb.3 + %18:sreg_32(s1) = COPY %12(s1) + %19:_(s64) = G_SEXT %16(s32) + %20:_(s32) = G_CONSTANT i32 2 + %21:_(s64) = G_SHL %19, %20(s32) + %22:_(p1) = G_PTR_ADD %5, %21(s64) + %23:_(s32) = G_LOAD %22(p1) :: (load (s32), addrspace 1) + %24:_(s32) = G_CONSTANT i32 0 + %25:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %23(s32), %24 + %26:_(s1) = G_CONSTANT i1 true + %27:sreg_32(s1) = COPY %26(s1) + %28:sreg_32(s1) = S_ANDN2_B32 %18(s1), $exec_lo, implicit-def $scc + %29:sreg_32(s1) = S_AND_B32 $exec_lo, %27(s1), implicit-def $scc + %30:sreg_32(s1) = S_OR_B32 %28(s1), %29(s1), implicit-def $scc + %31:sreg_32(s1) = COPY %30(s1) + %32:sreg_32_xm0_xexec(s32) = SI_IF %25(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.2 + + bb.2: + successors: %bb.4(0x40000000), %bb.5(0x40000000) + + %33:_(s32) = G_CONSTANT i32 2 + %34:_(s64) = G_SHL %19, %33(s32) + %35:_(p1) = G_PTR_ADD %8, %34(s64) + %36:_(s32) = G_LOAD %35(p1) :: (load (s32), addrspace 1) + %37:_(s32) = G_CONSTANT i32 0 + %38:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %36(s32), %37 + %39:_(s1) = G_CONSTANT i1 true + %40:sreg_32(s1) = COPY %39(s1) + %41:sreg_32(s1) = COPY %40(s1) + %42:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.4 + + bb.3: + successors: %bb.6(0x04000000), %bb.1(0x7c000000) + + %13:sreg_32(s1) = PHI %30(s1), %bb.1, %43(s1), %bb.5 + %17:_(s32) = G_PHI %44(s32), %bb.5, %9(s32), %bb.1 + %45:sreg_32(s1) = COPY %13(s1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %32(s32) + %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %45(s1), %14(s32) + SI_LOOP %15(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.6 + + bb.4: + successors: %bb.5(0x80000000) + + %46:_(s32) = G_CONSTANT i32 2 + %47:_(s64) = G_SHL %19, %46(s32) + %48:_(p1) = G_PTR_ADD %2, %47(s64) + %49:_(s32) = G_LOAD %48(p1) :: (load (s32), addrspace 1) + %50:_(s32) = G_CONSTANT i32 1 + %51:_(s32) = G_ADD %49, %50 + G_STORE %51(s32), %48(p1) :: (store (s32), addrspace 1) + %52:_(s32) = G_ADD %16, %50 + %53:_(s32) = G_CONSTANT i32 100 + %54:_(s1) = G_ICMP intpred(ult), %16(s32), %53 + %55:sreg_32(s1) = COPY %54(s1) + %56:sreg_32(s1) = S_ANDN2_B32 %41(s1), $exec_lo, implicit-def $scc + %57:sreg_32(s1) = S_AND_B32 $exec_lo, %55(s1), implicit-def $scc + %58:sreg_32(s1) = S_OR_B32 %56(s1), %57(s1), implicit-def $scc + + bb.5: + successors: %bb.3(0x80000000) + + %59:sreg_32(s1) = PHI %40(s1), %bb.2, %58(s1), %bb.4 + %44:_(s32) = G_PHI %52(s32), %bb.4, %9(s32), %bb.2 + %60:sreg_32(s1) = COPY %59(s1) + %61:sreg_32(s1) = COPY %60(s1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %42(s32) + %62:sreg_32(s1) = S_ANDN2_B32 %31(s1), $exec_lo, implicit-def $scc + %63:sreg_32(s1) = S_AND_B32 $exec_lo, %61(s1), implicit-def $scc + %43:sreg_32(s1) = S_OR_B32 %62(s1), %63(s1), implicit-def $scc + G_BR %bb.3 + + bb.6: + %64:_(s32) = G_PHI %15(s32), %bb.3 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %64(s32) + S_ENDPGM 0 +... 
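+
+# The S_ANDN2_B32/S_AND_B32/S_OR_B32 sequences on $exec_lo above are lane-mask
+# merges for the i1 phis: S_ANDN2 keeps the previous mask's bits for lanes that
+# are currently inactive, S_AND takes the freshly computed i1 for the active
+# lanes, and S_OR combines both into the updated mask.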
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.ll new file mode 100644 index 0000000000000..0b4eb458b254f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS_GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=OLD_RBS_GFX12 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=NEW_RBS_GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=NEW_RBS_GFX12 %s + +define amdgpu_ps void @salu_float(float inreg %a, float inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) { +; OLD_RBS_GFX10-LABEL: salu_float: +; OLD_RBS_GFX10: ; %bb.0: +; OLD_RBS_GFX10-NEXT: v_add_f32_e64 v2, s0, s1 +; OLD_RBS_GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; OLD_RBS_GFX10-NEXT: v_add_nc_u32_e32 v2, s2, v2 +; OLD_RBS_GFX10-NEXT: global_store_dword v[0:1], v2, off +; OLD_RBS_GFX10-NEXT: s_endpgm +; +; OLD_RBS_GFX12-LABEL: salu_float: +; OLD_RBS_GFX12: ; %bb.0: +; OLD_RBS_GFX12-NEXT: s_add_f32 s0, s0, s1 +; OLD_RBS_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; OLD_RBS_GFX12-NEXT: s_cvt_u32_f32 s0, s0 +; OLD_RBS_GFX12-NEXT: s_add_co_i32 s0, s0, s2 +; OLD_RBS_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; OLD_RBS_GFX12-NEXT: v_mov_b32_e32 v2, s0 +; OLD_RBS_GFX12-NEXT: global_store_b32 v[0:1], v2, off +; OLD_RBS_GFX12-NEXT: s_endpgm +; +; NEW_RBS_GFX10-LABEL: salu_float: +; NEW_RBS_GFX10: ; %bb.0: +; NEW_RBS_GFX10-NEXT: v_add_f32_e64 v2, s0, s1 +; NEW_RBS_GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; NEW_RBS_GFX10-NEXT: v_add_nc_u32_e32 v2, s2, v2 +; NEW_RBS_GFX10-NEXT: global_store_dword v[0:1], v2, off +; NEW_RBS_GFX10-NEXT: s_endpgm +; +; NEW_RBS_GFX12-LABEL: salu_float: +; NEW_RBS_GFX12: ; %bb.0: +; NEW_RBS_GFX12-NEXT: s_add_f32 s0, s0, s1 +; NEW_RBS_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; NEW_RBS_GFX12-NEXT: s_cvt_u32_f32 s0, s0 +; NEW_RBS_GFX12-NEXT: s_add_co_i32 s0, s0, s2 +; NEW_RBS_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NEW_RBS_GFX12-NEXT: v_mov_b32_e32 v2, s0 +; NEW_RBS_GFX12-NEXT: global_store_b32 v[0:1], v2, off +; NEW_RBS_GFX12-NEXT: s_endpgm + %add = fadd float %a, %b + %add.i32 = fptoui float %add to i32 + %res = add i32 %add.i32, %c + store i32 %res, ptr addrspace(1) %ptr + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.mir new file mode 100644 index 0000000000000..98a8f4f04e49d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.mir @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -o - | FileCheck %s -check-prefixes=OLD_RBS_GFX10 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=regbankselect %s -o - | FileCheck %s -check-prefixes=OLD_RBS_GFX12 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -o - | FileCheck %s -check-prefixes=NEW_RBS_GFX10 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=regbankselect %s -o - | 
FileCheck %s -check-prefixes=NEW_RBS_GFX12 + +--- +name: salu_float +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + + ; OLD_RBS_GFX10-LABEL: name: salu_float + ; OLD_RBS_GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + ; OLD_RBS_GFX10-NEXT: {{ $}} + ; OLD_RBS_GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS_GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; OLD_RBS_GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; OLD_RBS_GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS_GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS_GFX10-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; OLD_RBS_GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; OLD_RBS_GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; OLD_RBS_GFX10-NEXT: [[FADD:%[0-9]+]]:vgpr(s32) = G_FADD [[COPY5]], [[COPY6]] + ; OLD_RBS_GFX10-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[FADD]](s32) + ; OLD_RBS_GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; OLD_RBS_GFX10-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY7]] + ; OLD_RBS_GFX10-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS_GFX10-NEXT: S_ENDPGM 0 + ; + ; OLD_RBS_GFX12-LABEL: name: salu_float + ; OLD_RBS_GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + ; OLD_RBS_GFX12-NEXT: {{ $}} + ; OLD_RBS_GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; OLD_RBS_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; OLD_RBS_GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS_GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS_GFX12-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; OLD_RBS_GFX12-NEXT: [[FADD:%[0-9]+]]:sgpr(s32) = G_FADD [[COPY]], [[COPY1]] + ; OLD_RBS_GFX12-NEXT: [[FPTOUI:%[0-9]+]]:sgpr(s32) = G_FPTOUI [[FADD]](s32) + ; OLD_RBS_GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[FPTOUI]], [[COPY2]] + ; OLD_RBS_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; OLD_RBS_GFX12-NEXT: G_STORE [[COPY5]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS_GFX12-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS_GFX10-LABEL: name: salu_float + ; NEW_RBS_GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + ; NEW_RBS_GFX10-NEXT: {{ $}} + ; NEW_RBS_GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS_GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; NEW_RBS_GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; NEW_RBS_GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS_GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS_GFX10-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; NEW_RBS_GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; NEW_RBS_GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; NEW_RBS_GFX10-NEXT: [[FADD:%[0-9]+]]:vgpr(s32) = G_FADD [[COPY5]], [[COPY6]] + ; NEW_RBS_GFX10-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[FADD]](s32) + ; NEW_RBS_GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; NEW_RBS_GFX10-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY7]] + ; NEW_RBS_GFX10-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS_GFX10-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS_GFX12-LABEL: name: salu_float + ; NEW_RBS_GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, 
$vgpr0, $vgpr1 + ; NEW_RBS_GFX12-NEXT: {{ $}} + ; NEW_RBS_GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; NEW_RBS_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; NEW_RBS_GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS_GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS_GFX12-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; NEW_RBS_GFX12-NEXT: [[FADD:%[0-9]+]]:sgpr(s32) = G_FADD [[COPY]], [[COPY1]] + ; NEW_RBS_GFX12-NEXT: [[FPTOUI:%[0-9]+]]:sgpr(s32) = G_FPTOUI [[FADD]](s32) + ; NEW_RBS_GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[FPTOUI]], [[COPY2]] + ; NEW_RBS_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; NEW_RBS_GFX12-NEXT: G_STORE [[COPY5]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS_GFX12-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $vgpr0 + %4:_(s32) = COPY $vgpr1 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = G_FADD %0, %1 + %7:_(s32) = G_FPTOUI %6(s32) + %8:_(s32) = G_ADD %7, %2 + G_STORE %8(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll new file mode 100644 index 0000000000000..287a8ab0e52f5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll @@ -0,0 +1,635 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=NEW_RBS %s + +; if instruction is uniform and there is available instruction, select SALU instruction +define amdgpu_ps void @uniform_in_vgpr(float inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: uniform_in_vgpr: +; OLD_RBS: ; %bb.0: +; OLD_RBS-NEXT: v_cvt_u32_f32_e32 v2, s0 +; OLD_RBS-NEXT: v_add_nc_u32_e32 v2, s1, v2 +; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: uniform_in_vgpr: +; NEW_RBS: ; %bb.0: +; NEW_RBS-NEXT: v_cvt_u32_f32_e32 v2, s0 +; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, s1, v2 +; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off +; NEW_RBS-NEXT: s_endpgm + %a.i32 = fptoui float %a to i32 + %res = add i32 %a.i32, %b + store i32 %res, ptr addrspace(1) %ptr + ret void +} + +; copy sgpr to vgpr + readfirstlane vgpr to sgpr combine from rb-legalize +define amdgpu_ps void @back_to_back_uniform_in_vgpr(float inreg %a, float inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: back_to_back_uniform_in_vgpr: +; OLD_RBS: ; %bb.0: +; OLD_RBS-NEXT: v_add_f32_e64 v2, s0, s1 +; OLD_RBS-NEXT: v_cvt_u32_f32_e32 v2, v2 +; OLD_RBS-NEXT: v_add_nc_u32_e32 v2, s2, v2 +; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: back_to_back_uniform_in_vgpr: +; NEW_RBS: ; %bb.0: +; NEW_RBS-NEXT: v_add_f32_e64 v2, s0, s1 +; NEW_RBS-NEXT: v_cvt_u32_f32_e32 v2, v2 +; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, s2, v2 +; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off +; NEW_RBS-NEXT: s_endpgm + %add = fadd float %a, %b + %add.i32 = fptoui float %add to i32 + %res = add i32 %add.i32, %c + store i32 %res, ptr addrspace(1) %ptr + ret void +} + +; fast rules for vector instructions +define amdgpu_cs void 
@buffer_load_uniform(<4 x i32> inreg %rsrc, i32 inreg %voffset, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: buffer_load_uniform: +; OLD_RBS: ; %bb.0: ; %.entry +; OLD_RBS-NEXT: v_mov_b32_e32 v2, s4 +; OLD_RBS-NEXT: buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen +; OLD_RBS-NEXT: s_waitcnt vmcnt(0) +; OLD_RBS-NEXT: v_add_nc_u32_e32 v2, 1, v3 +; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: buffer_load_uniform: +; NEW_RBS: ; %bb.0: ; %.entry +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s4 +; NEW_RBS-NEXT: buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen +; NEW_RBS-NEXT: s_waitcnt vmcnt(0) +; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, 1, v3 +; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off +; NEW_RBS-NEXT: s_endpgm +.entry: + %vec = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %el1 = extractelement <4 x i32> %vec, i64 1 + %res = add i32 %el1, 1 + store i32 %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_cs void @buffer_load_divergent(<4 x i32> inreg %rsrc, i32 %voffset, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: buffer_load_divergent: +; OLD_RBS: ; %bb.0: ; %.entry +; OLD_RBS-NEXT: buffer_load_dwordx4 v[3:6], v0, s[0:3], 0 offen +; OLD_RBS-NEXT: s_waitcnt vmcnt(0) +; OLD_RBS-NEXT: v_add_nc_u32_e32 v0, 1, v4 +; OLD_RBS-NEXT: global_store_dword v[1:2], v0, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: buffer_load_divergent: +; NEW_RBS: ; %bb.0: ; %.entry +; NEW_RBS-NEXT: buffer_load_dwordx4 v[3:6], v0, s[0:3], 0 offen +; NEW_RBS-NEXT: s_waitcnt vmcnt(0) +; NEW_RBS-NEXT: v_add_nc_u32_e32 v0, 1, v4 +; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off +; NEW_RBS-NEXT: s_endpgm +.entry: + %vec = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %el1 = extractelement <4 x i32> %vec, i64 1 + %res = add i32 %el1, 1 + store i32 %res, ptr addrspace(1) %ptr + ret void +} + +;lowering in rb-legalize (sgpr S64 is legal, vgpr has to be split to S32) +define amdgpu_ps void @vgpr_and_i64(i64 %a, i64 %b, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: vgpr_and_i64: +; OLD_RBS: ; %bb.0: +; OLD_RBS-NEXT: v_and_b32_e32 v0, v0, v2 +; OLD_RBS-NEXT: v_and_b32_e32 v1, v1, v3 +; OLD_RBS-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: vgpr_and_i64: +; NEW_RBS: ; %bb.0: +; NEW_RBS-NEXT: v_and_b32_e32 v0, v0, v2 +; NEW_RBS-NEXT: v_and_b32_e32 v1, v1, v3 +; NEW_RBS-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; NEW_RBS-NEXT: s_endpgm + %res = and i64 %a, %b + store i64 %res, ptr addrspace(1) %ptr + ret void +} + +; It is up to user instruction to deal with potential truncated bits in reg. +; Here G_ABS needs to sign extend S16 in reg to S32 and then do S32 G_ABS. 
+define amdgpu_ps void @abs_sgpr_i16(i16 inreg %arg, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: abs_sgpr_i16: +; OLD_RBS: ; %bb.0: +; OLD_RBS-NEXT: s_sext_i32_i16 s0, s0 +; OLD_RBS-NEXT: s_abs_i32 s0, s0 +; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0 +; OLD_RBS-NEXT: global_store_short v[0:1], v2, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: abs_sgpr_i16: +; NEW_RBS: ; %bb.0: +; NEW_RBS-NEXT: s_sext_i32_i16 s0, s0 +; NEW_RBS-NEXT: s_abs_i32 s0, s0 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 +; NEW_RBS-NEXT: global_store_short v[0:1], v2, off +; NEW_RBS-NEXT: s_endpgm + %res = call i16 @llvm.abs.i16(i16 %arg, i1 false) + store i16 %res, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps void @uniform_i1_phi(ptr addrspace(1) %out, i32 inreg %tid, i32 inreg %cond) { +; OLD_RBS-LABEL: uniform_i1_phi: +; OLD_RBS: ; %bb.0: ; %A +; OLD_RBS-NEXT: s_cmp_ge_u32 s0, 6 +; OLD_RBS-NEXT: s_cselect_b32 s2, 1, 0 +; OLD_RBS-NEXT: s_cmp_lg_u32 s1, 0 +; OLD_RBS-NEXT: s_cbranch_scc1 .LBB6_2 +; OLD_RBS-NEXT: ; %bb.1: ; %B +; OLD_RBS-NEXT: s_cmp_lt_u32 s0, 1 +; OLD_RBS-NEXT: s_cselect_b32 s2, 1, 0 +; OLD_RBS-NEXT: .LBB6_2: ; %exit +; OLD_RBS-NEXT: s_bfe_i32 s0, s2, 0x10000 +; OLD_RBS-NEXT: s_add_i32 s0, s0, 2 +; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0 +; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: uniform_i1_phi: +; NEW_RBS: ; %bb.0: ; %A +; NEW_RBS-NEXT: s_cmp_ge_u32 s0, 6 +; NEW_RBS-NEXT: s_cselect_b32 s2, 1, 0 +; NEW_RBS-NEXT: s_cmp_lg_u32 s1, 0 +; NEW_RBS-NEXT: s_cbranch_scc1 .LBB6_2 +; NEW_RBS-NEXT: ; %bb.1: ; %B +; NEW_RBS-NEXT: s_cmp_lt_u32 s0, 1 +; NEW_RBS-NEXT: s_cselect_b32 s2, 1, 0 +; NEW_RBS-NEXT: .LBB6_2: ; %exit +; NEW_RBS-NEXT: s_bfe_i32 s0, s2, 0x10000 +; NEW_RBS-NEXT: s_add_i32 s0, s0, 2 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 +; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off +; NEW_RBS-NEXT: s_endpgm +A: + %val_A = icmp uge i32 %tid, 6 + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %B, label %exit + +B: + %val_B = icmp ult i32 %tid, 1 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %sel = select i1 %phi, i32 1, i32 2 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +; This is, in effect, an i1 readfirstlane: +; a uniform i1 result from an instruction that is only available on the VALU. +define amdgpu_ps void @vcc_to_scc(float inreg %a, i32 inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: vcc_to_scc: +; OLD_RBS: ; %bb.0: +; OLD_RBS-NEXT: v_mov_b32_e32 v2, s2 +; OLD_RBS-NEXT: v_cmp_eq_f32_e64 s0, s0, 0 +; OLD_RBS-NEXT: v_cndmask_b32_e64 v2, v2, s1, s0 +; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: vcc_to_scc: +; NEW_RBS: ; %bb.0: +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s2 +; NEW_RBS-NEXT: v_cmp_eq_f32_e64 s0, s0, 0 +; NEW_RBS-NEXT: v_cndmask_b32_e64 v2, v2, s1, s0 +; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off +; NEW_RBS-NEXT: s_endpgm + %vcc_to_scc = fcmp oeq float %a, 0.0 + %select = select i1 %vcc_to_scc, i32 %b, i32 %c + store i32 %select, ptr addrspace(1) %ptr + ret void +} + +; The combiner in rb-legalize recognizes the sgpr S1-to-vcc copy. +define amdgpu_ps void @scc_to_vcc(i32 inreg %a, i32 %b, i32 %c, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: scc_to_vcc: +; OLD_RBS: ; %bb.0: +; OLD_RBS-NEXT: s_cmp_eq_u32 s0, 0 +; OLD_RBS-NEXT: s_cselect_b32 s0, 1, 0 +; OLD_RBS-NEXT: s_and_b32 s0, 1, s0 +; OLD_RBS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; OLD_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; OLD_RBS-NEXT: global_store_dword v[2:3], v0, off +; OLD_RBS-NEXT: s_endpgm +; 
+; NEW_RBS-LABEL: scc_to_vcc: +; NEW_RBS: ; %bb.0: +; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 0 +; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0 +; NEW_RBS-NEXT: s_and_b32 s0, 1, s0 +; NEW_RBS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; NEW_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; NEW_RBS-NEXT: global_store_dword v[2:3], v0, off +; NEW_RBS-NEXT: s_endpgm + %scc_to_vcc = icmp eq i32 %a, 0 + %select = select i1 %scc_to_vcc, i32 %b, i32 %c + store i32 %select, ptr addrspace(1) %ptr + ret void +} + +; This is the only G_TRUNC that is not a no-op in global-isel for AMDGPU. +define amdgpu_ps void @vgpr_to_vcc_trunc(i32 %a, i32 %b, i32 %c, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: vgpr_to_vcc_trunc: +; OLD_RBS: ; %bb.0: +; OLD_RBS-NEXT: v_and_b32_e32 v0, 1, v0 +; OLD_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; OLD_RBS-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; OLD_RBS-NEXT: global_store_dword v[3:4], v0, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: vgpr_to_vcc_trunc: +; NEW_RBS: ; %bb.0: +; NEW_RBS-NEXT: v_and_b32_e32 v0, 1, v0 +; NEW_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; NEW_RBS-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; NEW_RBS-NEXT: global_store_dword v[3:4], v0, off +; NEW_RBS-NEXT: s_endpgm + %vcc = trunc i32 %a to i1 + %select = select i1 %vcc, i32 %b, i32 %c + store i32 %select, ptr addrspace(1) %ptr + ret void +} + +; An i1 input to zext or sext is something that survived the legalizer (i.e. it is not a trunc); +; lower it to a select. +define amdgpu_ps void @zext(i32 inreg %a, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: zext: +; OLD_RBS: ; %bb.0: +; OLD_RBS-NEXT: s_cmp_eq_u32 s0, 10 +; OLD_RBS-NEXT: s_cselect_b32 s0, 1, 0 +; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0 +; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: zext: +; NEW_RBS: ; %bb.0: +; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 10 +; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 +; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off +; NEW_RBS-NEXT: s_endpgm + %bool = icmp eq i32 %a, 10 + %zext = zext i1 %bool to i32 + store i32 %zext, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps void @sext(i32 inreg %a, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: sext: +; OLD_RBS: ; %bb.0: +; OLD_RBS-NEXT: s_cmp_eq_u32 s0, 10 +; OLD_RBS-NEXT: s_cselect_b32 s0, 1, 0 +; OLD_RBS-NEXT: s_bfe_i32 s0, s0, 0x10000 +; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0 +; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: sext: +; NEW_RBS: ; %bb.0: +; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 10 +; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0 +; NEW_RBS-NEXT: s_bfe_i32 s0, s0, 0x10000 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 +; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off +; NEW_RBS-NEXT: s_endpgm + %bool = icmp eq i32 %a, 10 + %sext = sext i1 %bool to i32 + store i32 %sext, ptr addrspace(1) %ptr + ret void +} + +; Divergent i1 bitwise, with the i1 in vcc; +; instruction selected into s_and_b32 on wave32 or s_and_b64 on wave64. 
+define amdgpu_ps void @and_i1_vcc(i32 %a, i32 %b, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: and_i1_vcc: +; OLD_RBS: ; %bb.0: +; OLD_RBS-NEXT: v_cmp_le_u32_e32 vcc_lo, 10, v0 +; OLD_RBS-NEXT: v_cmp_le_u32_e64 s0, 20, v1 +; OLD_RBS-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 +; OLD_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; OLD_RBS-NEXT: global_store_dword v[2:3], v0, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: and_i1_vcc: +; NEW_RBS: ; %bb.0: +; NEW_RBS-NEXT: v_cmp_le_u32_e32 vcc_lo, 10, v0 +; NEW_RBS-NEXT: v_cmp_le_u32_e64 s0, 20, v1 +; NEW_RBS-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 +; NEW_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; NEW_RBS-NEXT: global_store_dword v[2:3], v0, off +; NEW_RBS-NEXT: s_endpgm + %cmp_a = icmp uge i32 %a, 10 + %cmp_b = icmp uge i32 %b, 20 + %cc = and i1 %cmp_a, %cmp_b + %res = select i1 %cc, i32 %a, i32 %b + store i32 %res, ptr addrspace(1) %ptr + ret void +} + +; Uniform i1 bitwise, held as i32 in an sgpr; instruction selected into s_and_b32. +define amdgpu_ps void @and_i1_scc(i32 inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) { +; OLD_RBS-LABEL: and_i1_scc: +; OLD_RBS: ; %bb.0: +; OLD_RBS-NEXT: s_cmp_ge_u32 s0, 10 +; OLD_RBS-NEXT: s_cselect_b32 s2, 1, 0 +; OLD_RBS-NEXT: s_cmp_ge_u32 s1, 20 +; OLD_RBS-NEXT: s_cselect_b32 s3, 1, 0 +; OLD_RBS-NEXT: s_and_b32 s2, s2, s3 +; OLD_RBS-NEXT: s_and_b32 s2, s2, 1 +; OLD_RBS-NEXT: s_cmp_lg_u32 s2, 0 +; OLD_RBS-NEXT: s_cselect_b32 s0, s0, s1 +; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0 +; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: and_i1_scc: +; NEW_RBS: ; %bb.0: +; NEW_RBS-NEXT: s_cmp_ge_u32 s0, 10 +; NEW_RBS-NEXT: s_cselect_b32 s2, 1, 0 +; NEW_RBS-NEXT: s_cmp_ge_u32 s1, 20 +; NEW_RBS-NEXT: s_cselect_b32 s3, 1, 0 +; NEW_RBS-NEXT: s_and_b32 s2, s2, s3 +; NEW_RBS-NEXT: s_and_b32 s2, s2, 1 +; NEW_RBS-NEXT: s_cmp_lg_u32 s2, 0 +; NEW_RBS-NEXT: s_cselect_b32 s0, s0, s1 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 +; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off +; NEW_RBS-NEXT: s_endpgm + %cmp_a = icmp uge i32 %a, 10 + %cmp_b = icmp uge i32 %b, 20 + %cc = and i1 %cmp_a, %cmp_b + %res = select i1 %cc, i32 %a, i32 %b + store i32 %res, ptr addrspace(1) %ptr + ret void +} + +; The old RBS selects an sgpr phi because it has sgpr inputs. 
+define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1) %out) { +; OLD_RBS-LABEL: divergent_phi_with_uniform_inputs: +; OLD_RBS: ; %bb.0: ; %A +; OLD_RBS-NEXT: s_mov_b32 s0, 0 +; OLD_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; OLD_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo +; OLD_RBS-NEXT: ; %bb.1: ; %B +; OLD_RBS-NEXT: s_mov_b32 s0, 1 +; OLD_RBS-NEXT: ; %bb.2: ; %exit +; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; OLD_RBS-NEXT: v_mov_b32_e32 v0, s0 +; OLD_RBS-NEXT: global_store_dword v[1:2], v0, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: divergent_phi_with_uniform_inputs: +; NEW_RBS: ; %bb.0: ; %A +; NEW_RBS-NEXT: s_mov_b32 s0, 0 +; NEW_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; NEW_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo +; NEW_RBS-NEXT: ; %bb.1: ; %B +; NEW_RBS-NEXT: s_mov_b32 s0, 1 +; NEW_RBS-NEXT: ; %bb.2: ; %exit +; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; NEW_RBS-NEXT: v_mov_b32_e32 v0, s0 +; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off +; NEW_RBS-NEXT: s_endpgm +A: + %cmp = icmp eq i32 %a, 0 + br i1 %cmp, label %B, label %exit + +B: + br label %exit + +exit: + %phi = phi i32 [ 0, %A ], [ 1, %B ] + store i32 %phi, ptr addrspace(1) %out + ret void +} + +; The old RBS assigned a vgpr to the uniform phi (because one input had an undetermined bank) +; and that propagated to the mul, which was not wrong. +; The new RBS assigns a vgpr to the destination of the mul even though both inputs are sgpr. +; TODO: implement temporal divergence lowering +define amdgpu_ps void @divergent_because_of_temporal_divergent_use(float %val, ptr addrspace(1) %addr) { +; OLD_RBS-LABEL: divergent_because_of_temporal_divergent_use: +; OLD_RBS: ; %bb.0: ; %entry +; OLD_RBS-NEXT: s_mov_b32 s0, -1 +; OLD_RBS-NEXT: v_mov_b32_e32 v3, s0 +; OLD_RBS-NEXT: s_mov_b32 s0, 0 +; OLD_RBS-NEXT: .LBB15_1: ; %loop +; OLD_RBS-NEXT: ; =>This Inner Loop Header: Depth=1 +; OLD_RBS-NEXT: v_add_nc_u32_e32 v3, 1, v3 +; OLD_RBS-NEXT: v_cvt_f32_u32_e32 v4, v3 +; OLD_RBS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 +; OLD_RBS-NEXT: s_or_b32 s0, vcc_lo, s0 +; OLD_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; OLD_RBS-NEXT: s_cbranch_execnz .LBB15_1 +; OLD_RBS-NEXT: ; %bb.2: ; %exit +; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; OLD_RBS-NEXT: v_mul_lo_u32 v0, v3, 10 +; OLD_RBS-NEXT: global_store_dword v[1:2], v0, off +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: divergent_because_of_temporal_divergent_use: +; NEW_RBS: ; %bb.0: ; %entry +; NEW_RBS-NEXT: s_mov_b32 s0, -1 +; NEW_RBS-NEXT: v_mov_b32_e32 v3, s0 +; NEW_RBS-NEXT: s_mov_b32 s0, 0 +; NEW_RBS-NEXT: .LBB15_1: ; %loop +; NEW_RBS-NEXT: ; =>This Inner Loop Header: Depth=1 +; NEW_RBS-NEXT: v_add_nc_u32_e32 v3, 1, v3 +; NEW_RBS-NEXT: v_cvt_f32_u32_e32 v4, v3 +; NEW_RBS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 +; NEW_RBS-NEXT: s_or_b32 s0, vcc_lo, s0 +; NEW_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; NEW_RBS-NEXT: s_cbranch_execnz .LBB15_1 +; NEW_RBS-NEXT: ; %bb.2: ; %exit +; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; NEW_RBS-NEXT: v_mul_lo_u32 v0, v3, 10 +; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off +; NEW_RBS-NEXT: s_endpgm +entry: + br label %loop + +loop: + %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ] + %f.counter = uitofp i32 %counter to float + %cond = fcmp ogt float %f.counter, %val + %counter.plus.1 = add i32 %counter, 1 + br i1 %cond, label %exit, label %loop + +exit: + %ceilx10 = mul i32 %counter, 10 + store i32 %ceilx10, ptr addrspace(1) %addr + ret void +} + +; Variables that handle the counter can be allocated to sgprs. 
+; Machine uniformity analysis claims some of those registers are divergent while +; LLVM-IR uniformity analysis claims corresponding values are uniform. +; TODO: fix this in Machine uniformity analysis. +define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; OLD_RBS-LABEL: loop_with_2breaks: +; OLD_RBS: ; %bb.0: ; %entry +; OLD_RBS-NEXT: s_mov_b32 s0, 0 +; OLD_RBS-NEXT: ; implicit-def: $sgpr1 +; OLD_RBS-NEXT: v_mov_b32_e32 v6, s0 +; OLD_RBS-NEXT: s_branch .LBB16_3 +; OLD_RBS-NEXT: .LBB16_1: ; %Flow3 +; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 +; OLD_RBS-NEXT: s_waitcnt_depctr 0xffe3 +; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; OLD_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo +; OLD_RBS-NEXT: s_and_b32 s3, exec_lo, s4 +; OLD_RBS-NEXT: s_or_b32 s1, s1, s3 +; OLD_RBS-NEXT: .LBB16_2: ; %Flow +; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 +; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; OLD_RBS-NEXT: s_and_b32 s2, exec_lo, s1 +; OLD_RBS-NEXT: s_or_b32 s0, s2, s0 +; OLD_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; OLD_RBS-NEXT: s_cbranch_execz .LBB16_6 +; OLD_RBS-NEXT: .LBB16_3: ; %A +; OLD_RBS-NEXT: ; =>This Inner Loop Header: Depth=1 +; OLD_RBS-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; OLD_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo +; OLD_RBS-NEXT: s_and_b32 s2, exec_lo, -1 +; OLD_RBS-NEXT: s_or_b32 s1, s1, s2 +; OLD_RBS-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] +; OLD_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 +; OLD_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo +; OLD_RBS-NEXT: global_load_dword v9, v[9:10], off +; OLD_RBS-NEXT: s_waitcnt vmcnt(0) +; OLD_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 +; OLD_RBS-NEXT: s_and_saveexec_b32 s2, vcc_lo +; OLD_RBS-NEXT: s_cbranch_execz .LBB16_2 +; OLD_RBS-NEXT: ; %bb.4: ; %B +; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 +; OLD_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7 +; OLD_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo +; OLD_RBS-NEXT: s_mov_b32 s4, -1 +; OLD_RBS-NEXT: global_load_dword v9, v[9:10], off +; OLD_RBS-NEXT: s_waitcnt vmcnt(0) +; OLD_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 +; OLD_RBS-NEXT: s_and_saveexec_b32 s3, vcc_lo +; OLD_RBS-NEXT: s_cbranch_execz .LBB16_1 +; OLD_RBS-NEXT: ; %bb.5: ; %loop.body +; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 +; OLD_RBS-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 +; OLD_RBS-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo +; OLD_RBS-NEXT: v_add_nc_u32_e32 v10, 1, v6 +; OLD_RBS-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6 +; OLD_RBS-NEXT: s_andn2_b32 s4, -1, exec_lo +; OLD_RBS-NEXT: global_load_dword v9, v[7:8], off +; OLD_RBS-NEXT: v_mov_b32_e32 v6, v10 +; OLD_RBS-NEXT: s_and_b32 s5, exec_lo, vcc_lo +; OLD_RBS-NEXT: s_or_b32 s4, s4, s5 +; OLD_RBS-NEXT: s_waitcnt vmcnt(0) +; OLD_RBS-NEXT: v_add_nc_u32_e32 v9, 1, v9 +; OLD_RBS-NEXT: global_store_dword v[7:8], v9, off +; OLD_RBS-NEXT: s_branch .LBB16_1 +; OLD_RBS-NEXT: .LBB16_6: ; %exit +; OLD_RBS-NEXT: s_endpgm +; +; NEW_RBS-LABEL: loop_with_2breaks: +; NEW_RBS: ; %bb.0: ; %entry +; NEW_RBS-NEXT: s_mov_b32 s0, 0 +; NEW_RBS-NEXT: ; implicit-def: $sgpr1 +; NEW_RBS-NEXT: v_mov_b32_e32 v6, s0 +; NEW_RBS-NEXT: s_branch .LBB16_3 +; NEW_RBS-NEXT: .LBB16_1: ; %Flow3 +; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 +; NEW_RBS-NEXT: s_waitcnt_depctr 0xffe3 +; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; NEW_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo +; NEW_RBS-NEXT: s_and_b32 s3, exec_lo, s4 +; NEW_RBS-NEXT: s_or_b32 s1, s1, s3 +; NEW_RBS-NEXT: .LBB16_2: ; %Flow +; NEW_RBS-NEXT: ; in Loop: 
Header=BB16_3 Depth=1 +; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; NEW_RBS-NEXT: s_and_b32 s2, exec_lo, s1 +; NEW_RBS-NEXT: s_or_b32 s0, s2, s0 +; NEW_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; NEW_RBS-NEXT: s_cbranch_execz .LBB16_6 +; NEW_RBS-NEXT: .LBB16_3: ; %A +; NEW_RBS-NEXT: ; =>This Inner Loop Header: Depth=1 +; NEW_RBS-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; NEW_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo +; NEW_RBS-NEXT: s_and_b32 s2, exec_lo, -1 +; NEW_RBS-NEXT: s_or_b32 s1, s1, s2 +; NEW_RBS-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] +; NEW_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 +; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo +; NEW_RBS-NEXT: global_load_dword v9, v[9:10], off +; NEW_RBS-NEXT: s_waitcnt vmcnt(0) +; NEW_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 +; NEW_RBS-NEXT: s_and_saveexec_b32 s2, vcc_lo +; NEW_RBS-NEXT: s_cbranch_execz .LBB16_2 +; NEW_RBS-NEXT: ; %bb.4: ; %B +; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 +; NEW_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7 +; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo +; NEW_RBS-NEXT: s_mov_b32 s4, -1 +; NEW_RBS-NEXT: global_load_dword v9, v[9:10], off +; NEW_RBS-NEXT: s_waitcnt vmcnt(0) +; NEW_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 +; NEW_RBS-NEXT: s_and_saveexec_b32 s3, vcc_lo +; NEW_RBS-NEXT: s_cbranch_execz .LBB16_1 +; NEW_RBS-NEXT: ; %bb.5: ; %loop.body +; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 +; NEW_RBS-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 +; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo +; NEW_RBS-NEXT: v_add_nc_u32_e32 v10, 1, v6 +; NEW_RBS-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6 +; NEW_RBS-NEXT: s_andn2_b32 s4, -1, exec_lo +; NEW_RBS-NEXT: global_load_dword v9, v[7:8], off +; NEW_RBS-NEXT: v_mov_b32_e32 v6, v10 +; NEW_RBS-NEXT: s_and_b32 s5, exec_lo, vcc_lo +; NEW_RBS-NEXT: s_or_b32 s4, s4, s5 +; NEW_RBS-NEXT: s_waitcnt vmcnt(0) +; NEW_RBS-NEXT: v_add_nc_u32_e32 v9, 1, v9 +; NEW_RBS-NEXT: global_store_dword v[7:8], v9, off +; NEW_RBS-NEXT: s_branch .LBB16_1 +; NEW_RBS-NEXT: .LBB16_6: ; %exit +; NEW_RBS-NEXT: s_endpgm +entry: + br label %A + +A: + %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ] + %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter + %a.val = load i32, ptr addrspace(1) %a.plus.counter + %a.cond = icmp eq i32 %a.val, 0 + br i1 %a.cond, label %exit, label %B + +B: + %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter + %b.val = load i32, ptr addrspace(1) %b.plus.counter + %b.cond = icmp eq i32 %b.val, 0 + br i1 %b.cond, label %exit, label %loop.body + +loop.body: + %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter + %x.val = load i32, ptr addrspace(1) %x.plus.counter + %x.val.plus.1 = add i32 %x.val, 1 + store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter + %counter.plus.1 = add i32 %counter, 1 + %x.cond = icmp ult i32 %counter, 100 + br i1 %x.cond, label %exit, label %A + +exit: + ret void +} + +declare i16 @llvm.abs.i16(i16, i1) +declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir new file mode 100644 index 0000000000000..ef3a0a3a67594 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir @@ -0,0 +1,1377 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 
-run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=OLD_RBS +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=NEW_RBS + +--- +name: uniform_in_vgpr +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + + ; OLD_RBS-LABEL: name: uniform_in_vgpr + ; OLD_RBS: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; OLD_RBS-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[COPY4]](s32) + ; OLD_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; OLD_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY5]] + ; OLD_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: uniform_in_vgpr + ; NEW_RBS: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; NEW_RBS-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[COPY4]](s32) + ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY5]] + ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $vgpr0 + %3:_(s32) = COPY $vgpr1 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s32) = G_FPTOUI %0(s32) + %6:_(s32) = G_ADD %5, %1 + G_STORE %6(s32), %4(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: back_to_back_uniform_in_vgpr +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + + ; OLD_RBS-LABEL: name: back_to_back_uniform_in_vgpr + ; OLD_RBS: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; OLD_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; OLD_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; OLD_RBS-NEXT: [[FADD:%[0-9]+]]:vgpr(s32) = G_FADD [[COPY5]], [[COPY6]] + ; OLD_RBS-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[FADD]](s32) + ; OLD_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; OLD_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY7]] + ; OLD_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: back_to_back_uniform_in_vgpr + ; NEW_RBS: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; NEW_RBS-NEXT: [[FADD:%[0-9]+]]:vgpr(s32) = G_FADD [[COPY5]], [[COPY6]] + ; NEW_RBS-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[FADD]](s32) + ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY7]] + ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $vgpr0 + %4:_(s32) = COPY $vgpr1 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = G_FADD %0, %1 + %7:_(s32) = G_FPTOUI %6(s32) + %8:_(s32) = G_ADD %7, %2 + G_STORE %8(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: buffer_load_uniform +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + + ; OLD_RBS-LABEL: name: buffer_load_uniform + ; OLD_RBS: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; OLD_RBS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; OLD_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; OLD_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; OLD_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) + ; OLD_RBS-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; OLD_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; OLD_RBS-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; OLD_RBS-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; OLD_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UV1]], [[COPY9]] + ; OLD_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: buffer_load_uniform + ; NEW_RBS: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; NEW_RBS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) + ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) + ; NEW_RBS-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; NEW_RBS-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; NEW_RBS-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UV1]], [[COPY9]] + ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + 
%1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $sgpr3 + %4:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s32) = COPY $sgpr4 + %6:_(s32) = COPY $vgpr0 + %7:_(s32) = COPY $vgpr1 + %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s32) = G_CONSTANT i32 0 + %10:_(<4 x s32>) = G_AMDGPU_BUFFER_LOAD %4(<4 x s32>), %9(s32), %5, %9, 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + %11:_(s32) = G_CONSTANT i32 1 + %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32) = G_UNMERGE_VALUES %10(<4 x s32>) + %16:_(s32) = G_ADD %13, %11 + G_STORE %16(s32), %8(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: buffer_load_divergent +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2 + + ; OLD_RBS-LABEL: name: buffer_load_divergent + ; OLD_RBS: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; OLD_RBS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; OLD_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; OLD_RBS-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; OLD_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; OLD_RBS-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; OLD_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; OLD_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UV1]], [[COPY8]] + ; OLD_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: buffer_load_divergent + ; NEW_RBS: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 + ; NEW_RBS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) + ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; NEW_RBS-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 
1, addrspace 8) + ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; NEW_RBS-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UV1]], [[COPY8]] + ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $sgpr3 + %4:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s32) = COPY $vgpr0 + %6:_(s32) = COPY $vgpr1 + %7:_(s32) = COPY $vgpr2 + %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s32) = G_CONSTANT i32 0 + %10:_(<4 x s32>) = G_AMDGPU_BUFFER_LOAD %4(<4 x s32>), %9(s32), %5, %9, 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + %11:_(s32) = G_CONSTANT i32 1 + %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32) = G_UNMERGE_VALUES %10(<4 x s32>) + %16:_(s32) = G_ADD %13, %11 + G_STORE %16(s32), %8(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: vgpr_and_i64 +legalized: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; OLD_RBS-LABEL: name: vgpr_and_i64 + ; OLD_RBS: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; OLD_RBS-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; OLD_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; OLD_RBS-NEXT: [[MV2:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; OLD_RBS-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64) + ; OLD_RBS-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64) + ; OLD_RBS-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] + ; OLD_RBS-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; OLD_RBS-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) + ; OLD_RBS-NEXT: G_STORE [[MV3]](s64), [[MV2]](p1) :: (store (s64), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: vgpr_and_i64 + ; NEW_RBS: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; NEW_RBS-NEXT: [[MV1:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; NEW_RBS-NEXT: [[MV2:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; NEW_RBS-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64) + ; NEW_RBS-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), 
[[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64) + ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] + ; NEW_RBS-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; NEW_RBS-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) + ; NEW_RBS-NEXT: G_STORE [[MV3]](s64), [[MV2]](p1) :: (store (s64), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(s64) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = COPY $vgpr4 + %7:_(s32) = COPY $vgpr5 + %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s64) = G_AND %2, %5 + G_STORE %9(s64), %8(p1) :: (store (s64), addrspace 1) + S_ENDPGM 0 +... + +--- +name: abs_sgpr_i16 +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1 + + ; OLD_RBS-LABEL: name: abs_sgpr_i16 + ; OLD_RBS: liveins: $sgpr0, $vgpr0, $vgpr1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; OLD_RBS-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16) + ; OLD_RBS-NEXT: [[ABS:%[0-9]+]]:sgpr(s32) = G_ABS [[SEXT]] + ; OLD_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[ABS]](s32) + ; OLD_RBS-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s16) + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ANYEXT]](s32) + ; OLD_RBS-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s16), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: abs_sgpr_i16 + ; NEW_RBS: liveins: $sgpr0, $vgpr0, $vgpr1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; NEW_RBS-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16) + ; NEW_RBS-NEXT: [[ABS:%[0-9]+]]:sgpr(s32) = G_ABS [[SEXT]] + ; NEW_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[ABS]](s32) + ; NEW_RBS-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s16) + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ANYEXT]](s32) + ; NEW_RBS-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s16), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s16) = G_TRUNC %0(s32) + %2:_(s32) = COPY $vgpr0 + %3:_(s32) = COPY $vgpr1 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s16) = G_ABS %1 + %6:_(s32) = G_ANYEXT %5(s16) + G_STORE %6(s32), %4(p1) :: (store (s16), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: uniform_i1_phi +legalized: true +tracksRegLiveness: true +body: | + ; OLD_RBS-LABEL: name: uniform_i1_phi + ; OLD_RBS: bb.0: + ; OLD_RBS-NEXT: successors: %bb.1(0x30000000), %bb.2(0x50000000) + ; OLD_RBS-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; OLD_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]] + ; OLD_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; OLD_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; OLD_RBS-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]] + ; OLD_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32) + ; OLD_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s1) + ; OLD_RBS-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1) + ; OLD_RBS-NEXT: G_BRCOND [[ZEXT]](s32), %bb.2 + ; OLD_RBS-NEXT: G_BR %bb.1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.1: + ; OLD_RBS-NEXT: successors: %bb.2(0x80000000) + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; OLD_RBS-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]] + ; OLD_RBS-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP2]](s32) + ; OLD_RBS-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s1) + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.2: + ; OLD_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1 + ; OLD_RBS-NEXT: [[TRUNC3:%[0-9]+]]:sgpr(s1) = G_TRUNC [[PHI]](s32) + ; OLD_RBS-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC3]](s1) + ; OLD_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; OLD_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SEXT]], [[C3]] + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; OLD_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: uniform_i1_phi + ; NEW_RBS: bb.0: + ; NEW_RBS-NEXT: successors: %bb.1(0x30000000), %bb.2(0x50000000) + ; NEW_RBS-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]] + ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]] + ; NEW_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32) + ; NEW_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s1) + ; NEW_RBS-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1) + ; NEW_RBS-NEXT: G_BRCOND [[ZEXT]](s32), %bb.2 + ; NEW_RBS-NEXT: G_BR %bb.1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.1: + ; 
NEW_RBS-NEXT: successors: %bb.2(0x80000000) + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; NEW_RBS-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]] + ; NEW_RBS-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP2]](s32) + ; NEW_RBS-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s1) + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.2: + ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1 + ; NEW_RBS-NEXT: [[TRUNC3:%[0-9]+]]:sgpr(s1) = G_TRUNC [[PHI]](s32) + ; NEW_RBS-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC3]](s1) + ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SEXT]], [[C3]] + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; NEW_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x30000000), %bb.2(0x50000000) + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32) + %3:_(s32) = COPY $sgpr0 + %4:_(s32) = COPY $sgpr1 + %5:_(s32) = G_CONSTANT i32 6 + %6:_(s1) = G_ICMP intpred(uge), %3(s32), %5 + %7:_(s32) = G_CONSTANT i32 0 + %8:_(s1) = G_ICMP intpred(ne), %4(s32), %7 + G_BRCOND %8(s1), %bb.2 + G_BR %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + %9:_(s32) = G_CONSTANT i32 1 + %10:_(s1) = G_ICMP intpred(ult), %3(s32), %9 + + bb.2: + %11:_(s1) = G_PHI %6(s1), %bb.0, %10(s1), %bb.1 + %12:_(s32) = G_SEXT %11(s1) + %13:_(s32) = G_CONSTANT i32 2 + %14:_(s32) = G_ADD %12, %13 + G_STORE %14(s32), %2(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: vcc_to_scc +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + + ; OLD_RBS-LABEL: name: vcc_to_scc + ; OLD_RBS: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + ; OLD_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; OLD_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; OLD_RBS-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY5]](s32), [[COPY6]] + ; OLD_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; OLD_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; OLD_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY7]], [[COPY8]] + ; OLD_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: vcc_to_scc + ; NEW_RBS: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; NEW_RBS-NEXT: 
[[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 + ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; NEW_RBS-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY5]](s32), [[COPY6]] + ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY7]], [[COPY8]] + ; NEW_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $sgpr2 + %3:_(s32) = COPY $vgpr0 + %4:_(s32) = COPY $vgpr1 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = G_FCONSTANT float 0.000000e+00 + %7:_(s1) = G_FCMP floatpred(oeq), %0(s32), %6 + %8:_(s32) = G_SELECT %7(s1), %1, %2 + G_STORE %8(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: scc_to_vcc +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; OLD_RBS-LABEL: name: scc_to_vcc + ; OLD_RBS: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; OLD_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; OLD_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; OLD_RBS-NEXT: [[COPY5:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; OLD_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY5]](s1), [[COPY1]], [[COPY2]] + ; OLD_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: scc_to_vcc + ; NEW_RBS: liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY5]](s1), [[COPY1]], [[COPY2]] + ; NEW_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = G_CONSTANT i32 0 + %7:_(s1) = G_ICMP intpred(eq), %0(s32), %6 + %8:_(s32) = G_SELECT %7(s1), %1, %2 + G_STORE %8(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: vgpr_to_vcc_trunc +legalized: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; OLD_RBS-LABEL: name: vgpr_to_vcc_trunc + ; OLD_RBS: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; OLD_RBS-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; OLD_RBS-NEXT: [[COPY5:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; OLD_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY5]](s1), [[COPY1]], [[COPY2]] + ; OLD_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: vgpr_to_vcc_trunc + ; NEW_RBS: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY5]](s1), [[COPY1]], [[COPY2]] + ; NEW_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(s32) = COPY $vgpr4 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s1) = G_TRUNC %0(s32) + %7:_(s32) = G_SELECT %6(s1), %1, %2 + G_STORE %7(s32), %5(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: zext +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1 + + ; OLD_RBS-LABEL: name: zext + ; OLD_RBS: liveins: $sgpr0, $vgpr0, $vgpr1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 + ; OLD_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; OLD_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; OLD_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1) + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ZEXT]](s32) + ; OLD_RBS-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: zext + ; NEW_RBS: liveins: $sgpr0, $vgpr0, $vgpr1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 + ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; NEW_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1) + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ZEXT]](s32) + ; NEW_RBS-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 10 + %5:_(s1) = G_ICMP intpred(eq), %0(s32), %4 + %6:_(s32) = G_ZEXT %5(s1) + G_STORE %6(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: sext +legalized: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0, $vgpr1 + + ; OLD_RBS-LABEL: name: sext + ; OLD_RBS: liveins: $sgpr0, $vgpr0, $vgpr1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 + ; OLD_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; OLD_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; OLD_RBS-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s1) + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SEXT]](s32) + ; OLD_RBS-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: sext + ; NEW_RBS: liveins: $sgpr0, $vgpr0, $vgpr1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 + ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; NEW_RBS-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s1) + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SEXT]](s32) + ; NEW_RBS-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 10 + %5:_(s1) = G_ICMP intpred(eq), %0(s32), %4 + %6:_(s32) = G_SEXT %5(s1) + G_STORE %6(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: and_i1_vcc +legalized: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; OLD_RBS-LABEL: name: and_i1_vcc + ; OLD_RBS: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; OLD_RBS-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[COPY4]] + ; OLD_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 20 + ; OLD_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; OLD_RBS-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[COPY5]] + ; OLD_RBS-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; OLD_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AND]](s1), [[COPY]], [[COPY1]] + ; OLD_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: and_i1_vcc + ; NEW_RBS: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[COPY4]] + ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 20 + ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[COPY5]] + ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] + ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AND]](s1), [[COPY]], [[COPY1]] + ; NEW_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s32) = G_CONSTANT i32 10 + %6:_(s1) = G_ICMP intpred(uge), %0(s32), %5 + %7:_(s32) = G_CONSTANT i32 20 + %8:_(s1) = G_ICMP intpred(uge), %1(s32), %7 + %9:_(s1) = G_AND %6, %8 + %10:_(s32) = G_SELECT %9(s1), %0, %1 + G_STORE %10(s32), %4(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: and_i1_scc +legalized: true +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + + ; OLD_RBS-LABEL: name: and_i1_scc + ; OLD_RBS: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 + ; OLD_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY]](s32), [[C]] + ; OLD_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; OLD_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 20 + ; OLD_RBS-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] + ; OLD_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32) + ; OLD_RBS-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1) + ; OLD_RBS-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s1) + ; OLD_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ANYEXT]], [[ANYEXT1]] + ; OLD_RBS-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s1) = G_TRUNC [[AND]](s32) + ; OLD_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC2]](s1) + ; OLD_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ZEXT]](s32), [[COPY]], [[COPY1]] + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; OLD_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: and_i1_scc + ; NEW_RBS: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 + ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY]](s32), [[C]] + ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32) + ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 20 + ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] + ; NEW_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32) + ; NEW_RBS-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1) + ; NEW_RBS-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s1) + ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ANYEXT]], [[ANYEXT1]] + ; NEW_RBS-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s1) = G_TRUNC [[AND]](s32) + ; NEW_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC2]](s1) + ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ZEXT]](s32), [[COPY]], [[COPY1]] + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; NEW_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $sgpr1 + %2:_(s32) = COPY $vgpr0 + %3:_(s32) = COPY $vgpr1 + %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32) + %5:_(s32) = G_CONSTANT i32 10 + %6:_(s1) = G_ICMP intpred(uge), %0(s32), %5 + %7:_(s32) = G_CONSTANT i32 20 + %8:_(s1) = G_ICMP intpred(uge), %1(s32), %7 + %9:_(s1) = G_AND %6, %8 + %10:_(s32) = G_SELECT %9(s1), %0, %1 + G_STORE %10(s32), %4(p1) :: 
(store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: divergent_phi_with_uniform_inputs +legalized: true +tracksRegLiveness: true +body: | + ; OLD_RBS-LABEL: name: divergent_phi_with_uniform_inputs + ; OLD_RBS: bb.0: + ; OLD_RBS-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; OLD_RBS-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; OLD_RBS-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]] + ; OLD_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; OLD_RBS-NEXT: G_BR %bb.1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.1: + ; OLD_RBS-NEXT: successors: %bb.2(0x80000000) + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.2: + ; OLD_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[C]](s32), %bb.0, [[C1]](s32), %bb.1 + ; OLD_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[PHI]](s32) + ; OLD_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: divergent_phi_with_uniform_inputs + ; NEW_RBS: bb.0: + ; NEW_RBS-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; NEW_RBS-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]] + ; NEW_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; NEW_RBS-NEXT: G_BR %bb.1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.1: + ; NEW_RBS-NEXT: successors: %bb.2(0x80000000) + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.2: + ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[C]](s32), %bb.0, [[C1]](s32), %bb.1 + ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[PHI]](s32) + ; NEW_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 0 + %5:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %0(s32), %4 + %6:sreg_32_xm0_xexec(s32) = SI_IF %5(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.1 + + bb.1: + 
successors: %bb.2(0x80000000) + + %7:_(s32) = G_CONSTANT i32 1 + + bb.2: + %8:_(s32) = G_PHI %4(s32), %bb.0, %7(s32), %bb.1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %6(s32) + G_STORE %8(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... + +--- +name: divergent_because_of_temporal_divergent_use +legalized: true +tracksRegLiveness: true +body: | + ; OLD_RBS-LABEL: name: divergent_because_of_temporal_divergent_use + ; OLD_RBS: bb.0: + ; OLD_RBS-NEXT: successors: %bb.1(0x80000000) + ; OLD_RBS-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; OLD_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.1: + ; OLD_RBS-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 + ; OLD_RBS-NEXT: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1 + ; OLD_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; OLD_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI1]], [[COPY3]] + ; OLD_RBS-NEXT: [[UITOFP:%[0-9]+]]:vgpr(s32) = G_UITOFP [[ADD]](s32) + ; OLD_RBS-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] + ; OLD_RBS-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32) + ; OLD_RBS-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; OLD_RBS-NEXT: G_BR %bb.2 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.2: + ; OLD_RBS-NEXT: [[PHI2:%[0-9]+]]:vgpr(s32) = G_PHI [[ADD]](s32), %bb.1 + ; OLD_RBS-NEXT: [[PHI3:%[0-9]+]]:sgpr(s32) = G_PHI [[INT]](s32), %bb.1 + ; OLD_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32) + ; OLD_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; OLD_RBS-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[PHI2]], [[COPY4]] + ; OLD_RBS-NEXT: G_STORE [[MUL]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: divergent_because_of_temporal_divergent_use + ; NEW_RBS: bb.0: + ; NEW_RBS-NEXT: successors: %bb.1(0x80000000) + ; NEW_RBS-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.1: + ; NEW_RBS-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 + ; NEW_RBS-NEXT: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1 + ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY 
[[C2]](s32) + ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI1]], [[COPY3]] + ; NEW_RBS-NEXT: [[UITOFP:%[0-9]+]]:vgpr(s32) = G_UITOFP [[ADD]](s32) + ; NEW_RBS-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] + ; NEW_RBS-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32) + ; NEW_RBS-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; NEW_RBS-NEXT: G_BR %bb.2 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.2: + ; NEW_RBS-NEXT: [[PHI2:%[0-9]+]]:vgpr(s32) = G_PHI [[ADD]](s32), %bb.1 + ; NEW_RBS-NEXT: [[PHI3:%[0-9]+]]:sgpr(s32) = G_PHI [[INT]](s32), %bb.1 + ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32) + ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; NEW_RBS-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[PHI2]], [[COPY4]] + ; NEW_RBS-NEXT: G_STORE [[MUL]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32) + %4:_(s32) = G_CONSTANT i32 -1 + %5:_(s32) = G_CONSTANT i32 0 + + bb.1: + successors: %bb.2(0x04000000), %bb.1(0x7c000000) + + %6:_(s32) = G_PHI %7(s32), %bb.1, %5(s32), %bb.0 + %8:_(s32) = G_PHI %4(s32), %bb.0, %9(s32), %bb.1 + %10:_(s32) = G_CONSTANT i32 1 + %9:_(s32) = G_ADD %8, %10 + %11:_(s32) = G_UITOFP %9(s32) + %12:_(s1) = G_FCMP floatpred(ogt), %11(s32), %0 + %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %12(s1), %6(s32) + SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.2 + + bb.2: + %13:_(s32) = G_PHI %9(s32), %bb.1 + %14:_(s32) = G_PHI %7(s32), %bb.1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32) + %15:_(s32) = G_CONSTANT i32 10 + %16:_(s32) = G_MUL %13, %15 + G_STORE %16(s32), %3(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
+ +--- +name: loop_with_2breaks +legalized: true +tracksRegLiveness: true +body: | + ; OLD_RBS-LABEL: name: loop_with_2breaks + ; OLD_RBS: bb.0: + ; OLD_RBS-NEXT: successors: %bb.1(0x80000000) + ; OLD_RBS-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; OLD_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; OLD_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; OLD_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; OLD_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; OLD_RBS-NEXT: [[MV1:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; OLD_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; OLD_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; OLD_RBS-NEXT: [[MV2:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; OLD_RBS-NEXT: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; OLD_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; OLD_RBS-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.1: + ; OLD_RBS-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %13(s1), %bb.3 + ; OLD_RBS-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0 + ; OLD_RBS-NEXT: [[PHI2:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3 + ; OLD_RBS-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; OLD_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[PHI2]](s32) + ; OLD_RBS-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; OLD_RBS-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[COPY7]], [[C1]](s32) + ; OLD_RBS-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY7]](s32), [[ASHR]](s32) + ; OLD_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; OLD_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; OLD_RBS-NEXT: [[SHL:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY8]](s32) + ; OLD_RBS-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64) + ; OLD_RBS-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) + ; OLD_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; OLD_RBS-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; OLD_RBS-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[COPY9]] + ; OLD_RBS-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; OLD_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C4]](s32) + ; OLD_RBS-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[TRUNC]](s1) + ; OLD_RBS-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc + ; OLD_RBS-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc + ; OLD_RBS-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; OLD_RBS-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) + ; OLD_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + ; OLD_RBS-NEXT: G_BR %bb.2 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.2: + ; OLD_RBS-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; OLD_RBS-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[C5]](s32) + ; 
OLD_RBS-NEXT: [[SHL1:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY12]](s32) + ; OLD_RBS-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64) + ; OLD_RBS-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1) + ; OLD_RBS-NEXT: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; OLD_RBS-NEXT: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[C6]](s32) + ; OLD_RBS-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[COPY13]] + ; OLD_RBS-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; OLD_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C7]](s32) + ; OLD_RBS-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[TRUNC1]](s1) + ; OLD_RBS-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1) + ; OLD_RBS-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; OLD_RBS-NEXT: G_BR %bb.4 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.3: + ; OLD_RBS-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %43(s1), %bb.5 + ; OLD_RBS-NEXT: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI %44(s32), %bb.5, [[DEF]](s32), %bb.1 + ; OLD_RBS-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) + ; OLD_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) + ; OLD_RBS-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY16]](s1), [[PHI1]](s32) + ; OLD_RBS-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; OLD_RBS-NEXT: G_BR %bb.6 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.4: + ; OLD_RBS-NEXT: successors: %bb.5(0x80000000) + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; OLD_RBS-NEXT: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[C8]](s32) + ; OLD_RBS-NEXT: [[SHL2:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY17]](s32) + ; OLD_RBS-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV]], [[SHL2]](s64) + ; OLD_RBS-NEXT: [[LOAD2:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1) + ; OLD_RBS-NEXT: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; OLD_RBS-NEXT: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32) + ; OLD_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[LOAD2]], [[COPY18]] + ; OLD_RBS-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1) + ; OLD_RBS-NEXT: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32) + ; OLD_RBS-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI2]], [[COPY19]] + ; OLD_RBS-NEXT: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 100 + ; OLD_RBS-NEXT: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[C10]](s32) + ; OLD_RBS-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[COPY20]] + ; OLD_RBS-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; OLD_RBS-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY15]](s1), $exec_lo, implicit-def $scc + ; OLD_RBS-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc + ; OLD_RBS-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.5: + ; OLD_RBS-NEXT: successors: %bb.3(0x80000000) + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY14]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; OLD_RBS-NEXT: [[PHI6:%[0-9]+]]:vgpr(s32) = G_PHI 
[[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2 + ; OLD_RBS-NEXT: [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; OLD_RBS-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[COPY22]](s1) + ; OLD_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) + ; OLD_RBS-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc + ; OLD_RBS-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY23]](s1), implicit-def $scc + ; OLD_RBS-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc + ; OLD_RBS-NEXT: G_BR %bb.3 + ; OLD_RBS-NEXT: {{ $}} + ; OLD_RBS-NEXT: bb.6: + ; OLD_RBS-NEXT: [[PHI7:%[0-9]+]]:sgpr(s32) = G_PHI [[INT]](s32), %bb.3 + ; OLD_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) + ; OLD_RBS-NEXT: S_ENDPGM 0 + ; + ; NEW_RBS-LABEL: name: loop_with_2breaks + ; NEW_RBS: bb.0: + ; NEW_RBS-NEXT: successors: %bb.1(0x80000000) + ; NEW_RBS-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; NEW_RBS-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; NEW_RBS-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 + ; NEW_RBS-NEXT: [[MV1:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; NEW_RBS-NEXT: [[MV2:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; NEW_RBS-NEXT: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; NEW_RBS-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.1: + ; NEW_RBS-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %13(s1), %bb.3 + ; NEW_RBS-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0 + ; NEW_RBS-NEXT: [[PHI2:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3 + ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[PHI2]](s32) + ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; NEW_RBS-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[COPY7]], [[C1]](s32) + ; NEW_RBS-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY7]](s32), [[ASHR]](s32) + ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; NEW_RBS-NEXT: [[SHL:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY8]](s32) + ; NEW_RBS-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64) + ; NEW_RBS-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) + ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; NEW_RBS-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[COPY9]] + ; NEW_RBS-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C4]](s32) + ; NEW_RBS-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[TRUNC]](s1) + ; NEW_RBS-NEXT: 
[[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc + ; NEW_RBS-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc + ; NEW_RBS-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; NEW_RBS-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) + ; NEW_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + ; NEW_RBS-NEXT: G_BR %bb.2 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.2: + ; NEW_RBS-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000) + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; NEW_RBS-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[C5]](s32) + ; NEW_RBS-NEXT: [[SHL1:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY12]](s32) + ; NEW_RBS-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64) + ; NEW_RBS-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1) + ; NEW_RBS-NEXT: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; NEW_RBS-NEXT: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[C6]](s32) + ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[COPY13]] + ; NEW_RBS-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; NEW_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C7]](s32) + ; NEW_RBS-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[TRUNC1]](s1) + ; NEW_RBS-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1) + ; NEW_RBS-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; NEW_RBS-NEXT: G_BR %bb.4 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.3: + ; NEW_RBS-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %43(s1), %bb.5 + ; NEW_RBS-NEXT: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI %44(s32), %bb.5, [[DEF]](s32), %bb.1 + ; NEW_RBS-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) + ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) + ; NEW_RBS-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY16]](s1), [[PHI1]](s32) + ; NEW_RBS-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; NEW_RBS-NEXT: G_BR %bb.6 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.4: + ; NEW_RBS-NEXT: successors: %bb.5(0x80000000) + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; NEW_RBS-NEXT: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[C8]](s32) + ; NEW_RBS-NEXT: [[SHL2:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY17]](s32) + ; NEW_RBS-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV]], [[SHL2]](s64) + ; NEW_RBS-NEXT: [[LOAD2:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1) + ; NEW_RBS-NEXT: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; NEW_RBS-NEXT: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32) + ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[LOAD2]], [[COPY18]] + ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32) + ; NEW_RBS-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI2]], [[COPY19]] + ; NEW_RBS-NEXT: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 100 + ; 
NEW_RBS-NEXT: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[C10]](s32) + ; NEW_RBS-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[COPY20]] + ; NEW_RBS-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; NEW_RBS-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY15]](s1), $exec_lo, implicit-def $scc + ; NEW_RBS-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc + ; NEW_RBS-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.5: + ; NEW_RBS-NEXT: successors: %bb.3(0x80000000) + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY14]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; NEW_RBS-NEXT: [[PHI6:%[0-9]+]]:vgpr(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2 + ; NEW_RBS-NEXT: [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; NEW_RBS-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[COPY22]](s1) + ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) + ; NEW_RBS-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc + ; NEW_RBS-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY23]](s1), implicit-def $scc + ; NEW_RBS-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc + ; NEW_RBS-NEXT: G_BR %bb.3 + ; NEW_RBS-NEXT: {{ $}} + ; NEW_RBS-NEXT: bb.6: + ; NEW_RBS-NEXT: [[PHI7:%[0-9]+]]:sgpr(s32) = G_PHI [[INT]](s32), %bb.3 + ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) + ; NEW_RBS-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32) + %6:_(s32) = COPY $vgpr4 + %7:_(s32) = COPY $vgpr5 + %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32) + %9:_(s32) = G_IMPLICIT_DEF + %10:_(s32) = G_CONSTANT i32 0 + %11:sreg_32(s1) = IMPLICIT_DEF + + bb.1: + successors: %bb.2(0x40000000), %bb.3(0x40000000) + + %12:sreg_32(s1) = PHI %11(s1), %bb.0, %13(s1), %bb.3 + %14:_(s32) = G_PHI %15(s32), %bb.3, %10(s32), %bb.0 + %16:_(s32) = G_PHI %10(s32), %bb.0, %17(s32), %bb.3 + %18:sreg_32(s1) = COPY %12(s1) + %19:_(s64) = G_SEXT %16(s32) + %20:_(s32) = G_CONSTANT i32 2 + %21:_(s64) = G_SHL %19, %20(s32) + %22:_(p1) = G_PTR_ADD %5, %21(s64) + %23:_(s32) = G_LOAD %22(p1) :: (load (s32), addrspace 1) + %24:_(s32) = G_CONSTANT i32 0 + %25:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %23(s32), %24 + %26:_(s1) = G_CONSTANT i1 true + %27:sreg_32(s1) = COPY %26(s1) + %28:sreg_32(s1) = S_ANDN2_B32 %18(s1), $exec_lo, implicit-def $scc + %29:sreg_32(s1) = S_AND_B32 $exec_lo, %27(s1), implicit-def $scc + %30:sreg_32(s1) = S_OR_B32 %28(s1), %29(s1), implicit-def $scc + %31:sreg_32(s1) = COPY %30(s1) + %32:sreg_32_xm0_xexec(s32) = SI_IF %25(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.2 + + bb.2: + successors: %bb.4(0x40000000), %bb.5(0x40000000) + + %33:_(s32) = G_CONSTANT i32 2 + %34:_(s64) = G_SHL %19, %33(s32) + %35:_(p1) = G_PTR_ADD %8, %34(s64) + %36:_(s32) = G_LOAD %35(p1) :: (load (s32), addrspace 1) + %37:_(s32) = G_CONSTANT i32 0 + %38:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %36(s32), %37 + %39:_(s1) = G_CONSTANT i1 
true + %40:sreg_32(s1) = COPY %39(s1) + %41:sreg_32(s1) = COPY %40(s1) + %42:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.4 + + bb.3: + successors: %bb.6(0x04000000), %bb.1(0x7c000000) + + %13:sreg_32(s1) = PHI %30(s1), %bb.1, %43(s1), %bb.5 + %17:_(s32) = G_PHI %44(s32), %bb.5, %9(s32), %bb.1 + %45:sreg_32(s1) = COPY %13(s1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %32(s32) + %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), %45(s1), %14(s32) + SI_LOOP %15(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + G_BR %bb.6 + + bb.4: + successors: %bb.5(0x80000000) + + %46:_(s32) = G_CONSTANT i32 2 + %47:_(s64) = G_SHL %19, %46(s32) + %48:_(p1) = G_PTR_ADD %2, %47(s64) + %49:_(s32) = G_LOAD %48(p1) :: (load (s32), addrspace 1) + %50:_(s32) = G_CONSTANT i32 1 + %51:_(s32) = G_ADD %49, %50 + G_STORE %51(s32), %48(p1) :: (store (s32), addrspace 1) + %52:_(s32) = G_ADD %16, %50 + %53:_(s32) = G_CONSTANT i32 100 + %54:_(s1) = G_ICMP intpred(ult), %16(s32), %53 + %55:sreg_32(s1) = COPY %54(s1) + %56:sreg_32(s1) = S_ANDN2_B32 %41(s1), $exec_lo, implicit-def $scc + %57:sreg_32(s1) = S_AND_B32 $exec_lo, %55(s1), implicit-def $scc + %58:sreg_32(s1) = S_OR_B32 %56(s1), %57(s1), implicit-def $scc + + bb.5: + successors: %bb.3(0x80000000) + + %59:sreg_32(s1) = PHI %40(s1), %bb.2, %58(s1), %bb.4 + %44:_(s32) = G_PHI %52(s32), %bb.4, %9(s32), %bb.2 + %60:sreg_32(s1) = COPY %59(s1) + %61:sreg_32(s1) = COPY %60(s1) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %42(s32) + %62:sreg_32(s1) = S_ANDN2_B32 %31(s1), $exec_lo, implicit-def $scc + %63:sreg_32(s1) = S_AND_B32 $exec_lo, %61(s1), implicit-def $scc + %43:sreg_32(s1) = S_OR_B32 %62(s1), %63(s1), implicit-def $scc + G_BR %bb.3 + + bb.6: + %64:_(s32) = G_PHI %15(s32), %bb.3 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %64(s32) + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 65c44768d3d88..6c62f3f225cd9 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -27,10 +27,10 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) { ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) ; OPT-NEXT: [[TMP6]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN]]) ; OPT-NEXT: [[TMP7:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP6]]) -; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN2]]) ; OPT-NEXT: br i1 [[TMP7]], label [[FLOW1]], label [[LOOP]] ; OPT: Flow1: ; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) +; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN2]]) ; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]]) ; OPT-NEXT: br i1 [[TMP9]], label [[IF:%.*]], label [[LOOP_OUTER]] ; OPT: IF: @@ -57,33 +57,37 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) { ; GCN-NEXT: .LBB0_2: ; %LOOP.outer ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_4 Depth 2 -; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GCN-NEXT: ; implicit-def: $sgpr2_sgpr3 +; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: s_branch .LBB0_4 ; GCN-NEXT: .LBB0_3: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2 -; GCN-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-NEXT: s_and_b64 s[8:9], exec, s[6:7] -; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GCN-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-NEXT: s_and_b64 s[10:11], exec, s[8:9] +; GCN-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_and_b64 s[10:11], s[6:7], exec +; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-NEXT: s_cbranch_execz .LBB0_1 ; GCN-NEXT: .LBB0_4: ; %LOOP ; GCN-NEXT: ; Parent Loop BB0_2 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, v0, v4 -; GCN-NEXT: s_or_b64 s[2:3], s[2:3], exec ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: s_or_b64 s[8:9], s[8:9], exec +; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_3 ; GCN-NEXT: ; %bb.5: ; %ENDIF ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], exec -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, v5, v0 ; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec -; GCN-NEXT: s_and_b64 s[10:11], vcc, exec -; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, v5, v0 +; GCN-NEXT: s_andn2_b64 s[8:9], s[8:9], exec +; GCN-NEXT: s_and_b64 s[12:13], vcc, exec +; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GCN-NEXT: s_branch .LBB0_3 ; GCN-NEXT: .LBB0_6: ; %IF ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll index a52d9ff526c2a..bd6ef9e088b12 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -mtriple=amdgcn-- -S 
-structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s @@ -46,51 +46,52 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocap ; GCN-NEXT: s_cbranch_vccz .LBB0_6 ; GCN-NEXT: .LBB0_7: ; %DummyReturnBlock ; GCN-NEXT: s_endpgm -; IR-LABEL: @reduced_nested_loop_conditions( -; IR-NEXT: bb: +; IR-LABEL: define amdgpu_kernel void @reduced_nested_loop_conditions( +; IR-SAME: ptr addrspace(3) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; IR-NEXT: [[BB:.*]]: ; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4:[0-9]+]] -; IR-NEXT: [[MY_TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(3) [[ARG:%.*]], i32 [[MY_TMP]] +; IR-NEXT: [[MY_TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(3) [[ARG]], i32 [[MY_TMP]] ; IR-NEXT: [[MY_TMP2:%.*]] = load volatile i64, ptr addrspace(3) [[MY_TMP1]], align 8 -; IR-NEXT: br label [[BB5:%.*]] -; IR: bb3: -; IR-NEXT: br i1 true, label [[BB4:%.*]], label [[BB13:%.*]] -; IR: bb4: -; IR-NEXT: br label [[FLOW:%.*]] -; IR: bb5: -; IR-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], [[BB10:%.*]] ], [ 0, [[BB:%.*]] ] -; IR-NEXT: [[MY_TMP6:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP5:%.*]], [[BB10]] ] +; IR-NEXT: br label %[[BB5:.*]] +; IR: [[BB3:.*]]: +; IR-NEXT: br i1 true, label %[[BB4:.*]], label %[[BB13:.*]] +; IR: [[BB4]]: +; IR-NEXT: br label %[[FLOW:.*]] +; IR: [[BB5]]: +; IR-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], %[[BB10:.*]] ], [ 0, %[[BB]] ] +; IR-NEXT: [[MY_TMP6:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB10]] ] ; IR-NEXT: [[MY_TMP7:%.*]] = icmp eq i32 [[MY_TMP6]], 1 ; IR-NEXT: [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[MY_TMP7]]) ; IR-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0 ; IR-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1 -; IR-NEXT: br i1 [[TMP1]], label [[BB8:%.*]], label [[FLOW]] -; IR: bb8: -; IR-NEXT: br label [[BB13]] -; IR: bb9: -; IR-NEXT: br i1 false, label [[BB3:%.*]], label [[BB9:%.*]] -; IR: bb10: +; IR-NEXT: br i1 [[TMP1]], label %[[BB8:.*]], label %[[FLOW]] +; IR: [[BB8]]: +; IR-NEXT: br label %[[BB13]] +; IR: [[BB9:.*]]: +; IR-NEXT: br i1 false, label %[[BB3]], label %[[BB9]] +; IR: [[BB10]]: ; IR-NEXT: [[TMP3:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP6]]) -; IR-NEXT: br i1 [[TMP3]], label [[BB23:%.*]], label [[BB5]] -; IR: Flow: -; IR-NEXT: [[TMP4:%.*]] = phi i1 [ [[MY_TMP22:%.*]], [[BB4]] ], [ true, [[BB5]] ] -; IR-NEXT: [[TMP5]] = phi i32 [ [[MY_TMP21:%.*]], [[BB4]] ], [ undef, [[BB5]] ] +; IR-NEXT: br i1 [[TMP3]], label %[[BB23:.*]], label %[[BB5]] +; IR: [[FLOW]]: +; IR-NEXT: [[TMP4:%.*]] = phi i1 [ [[MY_TMP22:%.*]], %[[BB4]] ], [ true, %[[BB5]] ] +; IR-NEXT: [[TMP5]] = phi i32 [ [[MY_TMP21:%.*]], %[[BB4]] ], [ undef, %[[BB5]] ] ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) ; IR-NEXT: [[TMP6]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN]]) -; IR-NEXT: br label [[BB10]] -; IR: bb13: -; IR-NEXT: [[MY_TMP14:%.*]] = phi i1 [ [[MY_TMP22]], [[BB3]] ], [ true, [[BB8]] ] +; IR-NEXT: br label %[[BB10]] +; IR: [[BB13]]: +; IR-NEXT: [[MY_TMP14:%.*]] = phi i1 [ [[MY_TMP22]], %[[BB3]] ], [ true, %[[BB8]] ] ; IR-NEXT: [[MY_TMP15:%.*]] = bitcast i64 [[MY_TMP2]] to <2 x i32> -; IR-NEXT: br i1 [[MY_TMP14]], label [[BB16:%.*]], label [[BB20:%.*]] -; IR: bb16: +; IR-NEXT: br i1 [[MY_TMP14]], label %[[BB16:.*]], label %[[BB20:.*]] +; IR: [[BB16]]: ; IR-NEXT: [[MY_TMP17:%.*]] = 
extractelement <2 x i32> [[MY_TMP15]], i64 1 ; IR-NEXT: [[MY_TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(3) undef, i32 [[MY_TMP17]] ; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[MY_TMP18]], align 4 -; IR-NEXT: br label [[BB20]] -; IR: bb20: -; IR-NEXT: [[MY_TMP21]] = phi i32 [ [[MY_TMP19]], [[BB16]] ], [ 0, [[BB13]] ] -; IR-NEXT: [[MY_TMP22]] = phi i1 [ false, [[BB16]] ], [ [[MY_TMP14]], [[BB13]] ] -; IR-NEXT: br label [[BB9]] -; IR: bb23: +; IR-NEXT: br label %[[BB20]] +; IR: [[BB20]]: +; IR-NEXT: [[MY_TMP21]] = phi i32 [ [[MY_TMP19]], %[[BB16]] ], [ 0, %[[BB13]] ] +; IR-NEXT: [[MY_TMP22]] = phi i1 [ false, %[[BB16]] ], [ [[MY_TMP14]], %[[BB13]] ] +; IR-NEXT: br label %[[BB9]] +; IR: [[BB23]]: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]]) ; IR-NEXT: ret void bb: @@ -188,66 +189,67 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm -; IR-LABEL: @nested_loop_conditions( -; IR-NEXT: bb: +; IR-LABEL: define amdgpu_kernel void @nested_loop_conditions( +; IR-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] { +; IR-NEXT: [[BB:.*]]: ; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4 ; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9 -; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]] -; IR: bb14.lr.ph: +; IR-NEXT: br i1 [[MY_TMP1235]], label %[[BB14_LR_PH:.*]], label %[[FLOW:.*]] +; IR: [[BB14_LR_PH]]: ; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4]] ; IR-NEXT: [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64 -; IR-NEXT: [[MY_TMP2:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[ARG:%.*]], i64 [[MY_TMP1]] +; IR-NEXT: [[MY_TMP2:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[ARG]], i64 [[MY_TMP1]] ; IR-NEXT: [[MY_TMP3:%.*]] = load i64, ptr addrspace(1) [[MY_TMP2]], align 16 ; IR-NEXT: [[MY_TMP932:%.*]] = load <4 x i32>, ptr addrspace(1) undef, align 16 ; IR-NEXT: [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0 -; IR-NEXT: br label [[BB14:%.*]] -; IR: Flow3: +; IR-NEXT: br label %[[BB14:.*]] +; IR: [[FLOW3:.*]]: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP20:%.*]]) ; IR-NEXT: [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP14:%.*]]) ; IR-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0 ; IR-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1 -; IR-NEXT: br i1 [[TMP1]], label [[BB4_BB13_CRIT_EDGE:%.*]], label [[FLOW4:%.*]] -; IR: bb4.bb13_crit_edge: -; IR-NEXT: br label [[FLOW4]] -; IR: Flow4: -; IR-NEXT: [[TMP3:%.*]] = phi i1 [ true, [[BB4_BB13_CRIT_EDGE]] ], [ false, [[FLOW3:%.*]] ] +; IR-NEXT: br i1 [[TMP1]], label %[[BB4_BB13_CRIT_EDGE:.*]], label %[[FLOW4:.*]] +; IR: [[BB4_BB13_CRIT_EDGE]]: +; IR-NEXT: br label %[[FLOW4]] +; IR: [[FLOW4]]: +; IR-NEXT: [[TMP3:%.*]] = phi i1 [ true, %[[BB4_BB13_CRIT_EDGE]] ], [ false, %[[FLOW3]] ] ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]]) -; IR-NEXT: br label [[FLOW]] -; IR: bb13: -; IR-NEXT: br label [[BB31:%.*]] -; IR: Flow: -; IR-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP3]], [[FLOW4]] ], [ true, [[BB:%.*]] ] +; IR-NEXT: br label %[[FLOW]] +; IR: [[BB13:.*]]: +; IR-NEXT: br label %[[BB31:.*]] +; IR: [[FLOW]]: +; IR-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP3]], %[[FLOW4]] ], [ true, %[[BB]] ] ; IR-NEXT: [[TMP5:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP4]]) ; IR-NEXT: [[TMP6:%.*]] = extractvalue 
{ i1, i64 } [[TMP5]], 0 ; IR-NEXT: [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP5]], 1 -; IR-NEXT: br i1 [[TMP6]], label [[BB13:%.*]], label [[BB31]] -; IR: bb14: -; IR-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP16:%.*]], [[FLOW1:%.*]] ], [ 0, [[BB14_LR_PH]] ] -; IR-NEXT: [[MY_TMP1037:%.*]] = phi i32 [ [[MY_TMP1033]], [[BB14_LR_PH]] ], [ [[TMP12:%.*]], [[FLOW1]] ] -; IR-NEXT: [[MY_TMP936:%.*]] = phi <4 x i32> [ [[MY_TMP932]], [[BB14_LR_PH]] ], [ [[TMP11:%.*]], [[FLOW1]] ] +; IR-NEXT: br i1 [[TMP6]], label %[[BB13]], label %[[BB31]] +; IR: [[BB14]]: +; IR-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP16:%.*]], %[[FLOW1:.*]] ], [ 0, %[[BB14_LR_PH]] ] +; IR-NEXT: [[MY_TMP1037:%.*]] = phi i32 [ [[MY_TMP1033]], %[[BB14_LR_PH]] ], [ [[TMP12:%.*]], %[[FLOW1]] ] +; IR-NEXT: [[MY_TMP936:%.*]] = phi <4 x i32> [ [[MY_TMP932]], %[[BB14_LR_PH]] ], [ [[TMP11:%.*]], %[[FLOW1]] ] ; IR-NEXT: [[MY_TMP15:%.*]] = icmp eq i32 [[MY_TMP1037]], 1 ; IR-NEXT: [[TMP8:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[MY_TMP15]]) ; IR-NEXT: [[TMP9:%.*]] = extractvalue { i1, i64 } [[TMP8]], 0 ; IR-NEXT: [[TMP10:%.*]] = extractvalue { i1, i64 } [[TMP8]], 1 -; IR-NEXT: br i1 [[TMP9]], label [[BB16:%.*]], label [[FLOW1]] -; IR: bb16: +; IR-NEXT: br i1 [[TMP9]], label %[[BB16:.*]], label %[[FLOW1]] +; IR: [[BB16]]: ; IR-NEXT: [[MY_TMP17:%.*]] = bitcast i64 [[MY_TMP3]] to <2 x i32> -; IR-NEXT: br label [[BB18:%.*]] -; IR: Flow1: -; IR-NEXT: [[TMP11]] = phi <4 x i32> [ [[MY_TMP9:%.*]], [[BB21:%.*]] ], [ undef, [[BB14]] ] -; IR-NEXT: [[TMP12]] = phi i32 [ [[MY_TMP10:%.*]], [[BB21]] ], [ undef, [[BB14]] ] -; IR-NEXT: [[TMP13:%.*]] = phi i1 [ [[MY_TMP12:%.*]], [[BB21]] ], [ true, [[BB14]] ] -; IR-NEXT: [[TMP14]] = phi i1 [ [[MY_TMP12]], [[BB21]] ], [ false, [[BB14]] ] -; IR-NEXT: [[TMP15:%.*]] = phi i1 [ false, [[BB21]] ], [ true, [[BB14]] ] +; IR-NEXT: br label %[[BB18:.*]] +; IR: [[FLOW1]]: +; IR-NEXT: [[TMP11]] = phi <4 x i32> [ [[MY_TMP9:%.*]], %[[BB21:.*]] ], [ undef, %[[BB14]] ] +; IR-NEXT: [[TMP12]] = phi i32 [ [[MY_TMP10:%.*]], %[[BB21]] ], [ undef, %[[BB14]] ] +; IR-NEXT: [[TMP13:%.*]] = phi i1 [ [[MY_TMP12:%.*]], %[[BB21]] ], [ true, %[[BB14]] ] +; IR-NEXT: [[TMP14]] = phi i1 [ [[MY_TMP12]], %[[BB21]] ], [ false, %[[BB14]] ] +; IR-NEXT: [[TMP15:%.*]] = phi i1 [ false, %[[BB21]] ], [ true, %[[BB14]] ] ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]]) ; IR-NEXT: [[TMP16]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP13]], i64 [[PHI_BROKEN]]) ; IR-NEXT: [[TMP17:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP16]]) -; IR-NEXT: br i1 [[TMP17]], label [[FLOW2:%.*]], label [[BB14]] -; IR: bb18: +; IR-NEXT: br i1 [[TMP17]], label %[[FLOW2:.*]], label %[[BB14]] +; IR: [[BB18]]: ; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4 ; IR-NEXT: [[MY_TMP20:%.*]] = icmp slt i32 [[MY_TMP19]], 9 -; IR-NEXT: br i1 [[MY_TMP20]], label [[BB21]], label [[BB18]] -; IR: bb21: +; IR-NEXT: br i1 [[MY_TMP20]], label %[[BB21]], label %[[BB18]] +; IR: [[BB21]]: ; IR-NEXT: [[MY_TMP22:%.*]] = extractelement <2 x i32> [[MY_TMP17]], i64 1 ; IR-NEXT: [[MY_TMP23:%.*]] = lshr i32 [[MY_TMP22]], 16 ; IR-NEXT: [[MY_TMP24:%.*]] = select i1 undef, i32 undef, i32 [[MY_TMP23]] @@ -263,16 +265,16 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar ; IR-NEXT: [[MY_TMP10]] = extractelement <4 x i32> [[MY_TMP9]], i64 0 ; IR-NEXT: [[MY_TMP11:%.*]] = load volatile i32, ptr addrspace(1) undef, align 4 ; IR-NEXT: [[MY_TMP12]] = icmp sge i32 [[MY_TMP11]], 9 -; IR-NEXT: br label [[FLOW1]] -; IR: Flow2: +; 
IR-NEXT: br label %[[FLOW1]] +; IR: [[FLOW2]]: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP16]]) ; IR-NEXT: [[TMP18:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]]) ; IR-NEXT: [[TMP19:%.*]] = extractvalue { i1, i64 } [[TMP18]], 0 ; IR-NEXT: [[TMP20]] = extractvalue { i1, i64 } [[TMP18]], 1 -; IR-NEXT: br i1 [[TMP19]], label [[BB31_LOOPEXIT:%.*]], label [[FLOW3]] -; IR: bb31.loopexit: -; IR-NEXT: br label [[FLOW3]] -; IR: bb31: +; IR-NEXT: br i1 [[TMP19]], label %[[BB31_LOOPEXIT:.*]], label %[[FLOW3]] +; IR: [[BB31_LOOPEXIT]]: +; IR-NEXT: br label %[[FLOW3]] +; IR: [[BB31]]: ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]]) ; IR-NEXT: store volatile i32 0, ptr addrspace(1) undef, align 4 ; IR-NEXT: ret void diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index a1a466fb04440..384a2c63122b8 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -4074,14 +4074,12 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_not_b32_e32 v2, 31 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-GISEL-NEXT: v_or_b32_e32 v2, s0, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; @@ -4191,15 +4189,12 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] -; VI-GISEL-NEXT: s_and_b32 s2, 0xffff, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: s_lshl_b32 s0, s2, 16 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffe0, v3 -; VI-GISEL-NEXT: v_or_b32_e32 v2, s0, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll new file mode 100644 index 0000000000000..b5f43f9f68936 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950 %s +define amdgpu_kernel void @v_ashr_pk_i8_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { +; GFX950-LABEL: v_ashr_pk_i8_i32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX950-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-NEXT: v_mov_b32_e32 v1, 0xffffff80 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7f +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_ashr_i32 s1, s1, s2 +; GFX950-NEXT: s_ashr_i32 s0, s0, s2 +; GFX950-NEXT: v_med3_i32 v3, s0, v1, v2 +; GFX950-NEXT: v_med3_i32 v1, s1, v1, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX950-NEXT: 
v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX950-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX950-NEXT:    s_endpgm
+  %insert.0 = insertelement <2 x i32> poison, i32 %src0, i64 0
+  %build_vector = insertelement <2 x i32> %insert.0, i32 %src1, i64 1
+  %src2.clamp = and i32 %src2, 31
+  %insert.1 = insertelement <2 x i32> poison, i32 %src2.clamp, i64 0
+  %src2.broadcast = shufflevector <2 x i32> %insert.1, <2 x i32> poison, <2 x i32> zeroinitializer
+  %ashr = ashr <2 x i32> %build_vector, %src2.broadcast
+  %sat.low = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> %ashr, <2 x i32> <i32 -128, i32 -128>)
+  %sat.hi = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> %sat.low, <2 x i32> <i32 127, i32 127>)
+  %trunc = trunc nsw <2 x i32> %sat.hi to <2 x i8>
+  %ret = bitcast <2 x i8> %trunc to i16
+  store i16 %ret, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_ashr_pk_u8_i32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+; GFX950-LABEL: v_ashr_pk_u8_i32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX950-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-NEXT:    s_ashr_i32 s1, s1, s2
+; GFX950-NEXT:    s_ashr_i32 s0, s0, s2
+; GFX950-NEXT:    v_med3_i32 v2, s0, 0, v1
+; GFX950-NEXT:    v_med3_i32 v1, s1, 0, v1
+; GFX950-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX950-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX950-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX950-NEXT:    s_endpgm
+  %insert.0 = insertelement <2 x i32> poison, i32 %src0, i64 0
+  %build_vector = insertelement <2 x i32> %insert.0, i32 %src1, i64 1
+  %src2.clamp = and i32 %src2, 31
+  %insert.1 = insertelement <2 x i32> poison, i32 %src2.clamp, i64 0
+  %src2.broadcast = shufflevector <2 x i32> %insert.1, <2 x i32> poison, <2 x i32> zeroinitializer
+  %ashr = ashr <2 x i32> %build_vector, %src2.broadcast
+  %sat.low = tail call <2 x i32> @llvm.smax.v2i32(<2 x i32> %ashr, <2 x i32> <i32 0, i32 0>)
+  %sat.hi = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> %sat.low, <2 x i32> <i32 255, i32 255>)
+  %trunc = trunc nsw <2 x i32> %sat.hi to <2 x i8>
+  %ret = bitcast <2 x i8> %trunc to i16
+  store i16 %ret, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-str.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-str.ll
index 5462240846994..edc74e393108f 100644
--- a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-str.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-str.ll
@@ -4,14 +4,14 @@
 ; tests for XCOFF object files.
; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -xcoff-traceback-table=false \ -; RUN: -mtriple powerpc-ibm-aix-xcoff -data-sections=false -ppc-merge-string-pool=false \ +; RUN: -mtriple powerpc-ibm-aix-xcoff -data-sections=false \ ; RUN: -global-merge-all-const=false < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -xcoff-traceback-table=false \ -; RUN: -mtriple powerpc64-ibm-aix-xcoff -data-sections=false -ppc-merge-string-pool=false \ +; RUN: -mtriple powerpc64-ibm-aix-xcoff -data-sections=false \ ; RUN: -global-merge-all-const=false < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff \ -; RUN: -xcoff-traceback-table=false -data-sections=false -ppc-merge-string-pool=false \ +; RUN: -xcoff-traceback-table=false -data-sections=false \ ; RUN: -global-merge-all-const=false -filetype=obj -o %t.o < %s ; RUN: llvm-objdump -D %t.o | FileCheck --check-prefix=CHECKOBJ %s diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-rodata.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-rodata.ll index 0fa47373964a0..9b0a3fe0a716f 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-rodata.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-rodata.ll @@ -1,16 +1,16 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc-ibm-aix-xcoff -data-sections=false -ppc-merge-string-pool=false < %s | \ +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc-ibm-aix-xcoff -data-sections=false < %s | \ ; RUN: FileCheck --check-prefixes=CHECK,CHECK32 %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc64-ibm-aix-xcoff -data-sections=false -ppc-merge-string-pool=false < %s | \ +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc64-ibm-aix-xcoff -data-sections=false < %s | \ ; RUN: FileCheck --check-prefixes=CHECK,CHECK64 %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc-ibm-aix-xcoff -data-sections=false -ppc-merge-string-pool=false \ +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc-ibm-aix-xcoff -data-sections=false \ ; RUN: -filetype=obj -o %t.o < %s ; RUN: llvm-readobj --section-headers --file-header %t.o | \ ; RUN: FileCheck --check-prefixes=OBJ,OBJ32 %s ; RUN: llvm-readobj --syms %t.o | FileCheck --check-prefixes=SYMS,SYMS32 %s ; RUN: llvm-objdump -D %t.o | FileCheck --check-prefix=DIS %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc64-ibm-aix-xcoff -data-sections=false -ppc-merge-string-pool=false \ +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc64-ibm-aix-xcoff -data-sections=false \ ; RUN: -filetype=obj -o %t64.o < %s ; RUN: llvm-readobj --section-headers --file-header %t64.o | \ ; RUN: FileCheck --check-prefixes=OBJ,OBJ64 %s diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-used-with-stringpool.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-used-with-stringpool.ll index bbcba59e2e33a..5e4784ac2904c 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-used-with-stringpool.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-used-with-stringpool.ll @@ -1,4 +1,4 @@ -;; Test that the string pooling pass does not pool globals that are +;; Test that the global merge pass does not pool globals that are ;; in llvm.used or in llvm.compiler.used. 
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc-ibm-aix-xcoff \ diff --git a/llvm/test/CodeGen/PowerPC/mergeable-string-pool-exceptions.ll b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-exceptions.ll index 03a830e087d26..10179bba136f9 100644 --- a/llvm/test/CodeGen/PowerPC/mergeable-string-pool-exceptions.ll +++ b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-exceptions.ll @@ -4,7 +4,7 @@ @id = private unnamed_addr constant [4 x i8] c"@id\00", align 1 @id2 = private unnamed_addr constant [5 x i8] c"@id2\00", align 1 -; Higher-aligned dummy to make sure it is first in the string pool. +; Higher-aligned dummy to make sure it is first in the global merge pool. @dummy = private unnamed_addr constant [1 x i32] [i32 42], align 4 define ptr @test1() personality ptr @__gnu_objc_personality_v0 { diff --git a/llvm/test/CodeGen/PowerPC/mergeable-string-pool-tls.ll b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-tls.ll index a726e7741cf0f..aa0b441646fd3 100644 --- a/llvm/test/CodeGen/PowerPC/mergeable-string-pool-tls.ll +++ b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-tls.ll @@ -258,8 +258,8 @@ entry: ret void } -; Check the contents of the TLS data and the __ModuleStringPool structure to -; check that TLS data has been skipped during string pool merging. +; Check the contents of the TLS data and the _MergedGlobals structure to +; check that TLS data has been skipped during global merge. ; CHECK64: .csect a[TL],2 ; CHECK64-NEXT: .lglobl a[TL] diff --git a/llvm/test/CodeGen/PowerPC/pr38087.ll b/llvm/test/CodeGen/PowerPC/pr38087.ll index 1216fa9cf8f26..933bf12cddaa6 100644 --- a/llvm/test/CodeGen/PowerPC/pr38087.ll +++ b/llvm/test/CodeGen/PowerPC/pr38087.ll @@ -11,9 +11,9 @@ declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #0 define void @draw_llvm_vs_variant0(<4 x float> %x) { ; CHECK-LABEL: draw_llvm_vs_variant0: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lxsd v3, 0(r3) -; CHECK-NEXT: vmrghh v3, v3, v3 +; CHECK-NEXT: lxsihzx v3, 0, r3 ; CHECK-NEXT: vextsh2w v3, v3 +; CHECK-NEXT: xxmrghw v3, v3, v3 ; CHECK-NEXT: xvcvsxwsp vs0, v3 ; CHECK-NEXT: xxspltw vs0, vs0, 2 ; CHECK-NEXT: xvmaddasp vs0, v2, v2 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll index 7133d5c100e75..b0711d7fbc772 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll @@ -3,12 +3,34 @@ ; RUN: -target-abi=ilp32d | FileCheck -check-prefixes=CHECKIFD,RV32IFD %s ; RUN: llc -mtriple=riscv64 -global-isel -mattr=+d -verify-machineinstrs < %s \ ; RUN: -target-abi=lp64d | FileCheck -check-prefixes=CHECKIFD,RV64IFD %s +; RUN: llc -mtriple=riscv32 -global-isel -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV32I %s +; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV64I %s define float @fcvt_s_d(double %a) nounwind { ; CHECKIFD-LABEL: fcvt_s_d: ; CHECKIFD: # %bb.0: ; CHECKIFD-NEXT: fcvt.s.d fa0, fa0 ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_s_d: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __truncdfsf2 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_d: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __truncdfsf2 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; 
RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptrunc double %a to float ret float %1 } @@ -18,6 +40,24 @@ define double @fcvt_d_s(float %a) nounwind { ; CHECKIFD: # %bb.0: ; CHECKIFD-NEXT: fcvt.d.s fa0, fa0 ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_s: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __extendsfdf2 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_s: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __extendsfdf2 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fpext float %a to double ret double %1 } @@ -27,6 +67,24 @@ define i32 @fcvt_w_d(double %a) nounwind { ; CHECKIFD: # %bb.0: ; CHECKIFD-NEXT: fcvt.w.d a0, fa0, rtz ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_w_d: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixdfsi +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_w_d: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixdfsi +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptosi double %a to i32 ret i32 %1 } @@ -36,6 +94,24 @@ define i32 @fcvt_wu_d(double %a) nounwind { ; CHECKIFD: # %bb.0: ; CHECKIFD-NEXT: fcvt.wu.d a0, fa0, rtz ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_wu_d: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixunsdfsi +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_wu_d: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixunsdfsi +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptoui double %a to i32 ret i32 %1 } @@ -60,6 +136,34 @@ define i32 @fcvt_wu_d_multiple_use(double %x, ptr %y) nounwind { ; RV64IFD-NEXT: li a0, 1 ; RV64IFD-NEXT: .LBB4_2: ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fcvt_wu_d_multiple_use: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixunsdfsi +; RV32I-NEXT: bnez a0, .LBB4_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: .LBB4_2: +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_wu_d_multiple_use: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixunsdfsi +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: bnez a1, .LBB4_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: .LBB4_2: +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %a = fptoui double %x to i32 %b = icmp eq i32 %a, 0 %c = select i1 %b, i32 1, i32 %a @@ -71,6 +175,25 @@ define double @fcvt_d_w(i32 %a) nounwind { ; CHECKIFD: # %bb.0: ; CHECKIFD-NEXT: fcvt.d.w fa0, a0 ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_w: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 
12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatsidf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_w: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: call __floatdidf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = sitofp i32 %a to double ret double %1 } @@ -81,6 +204,26 @@ define double @fcvt_d_w_load(ptr %p) nounwind { ; CHECKIFD-NEXT: lw a0, 0(a0) ; CHECKIFD-NEXT: fcvt.d.w fa0, a0 ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_w_load: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: call __floatsidf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_w_load: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: call __floatdidf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %a = load i32, ptr %p %1 = sitofp i32 %a to double ret double %1 @@ -91,6 +234,26 @@ define double @fcvt_d_wu(i32 %a) nounwind { ; CHECKIFD: # %bb.0: ; CHECKIFD-NEXT: fcvt.d.wu fa0, a0 ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_wu: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatunsidf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_wu: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: call __floatundidf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = uitofp i32 %a to double ret double %1 } @@ -107,6 +270,26 @@ define double @fcvt_d_wu_load(ptr %p) nounwind { ; RV64IFD-NEXT: lwu a0, 0(a0) ; RV64IFD-NEXT: fcvt.d.wu fa0, a0 ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_wu_load: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: call __floatunsidf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_wu_load: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lwu a0, 0(a0) +; RV64I-NEXT: call __floatundidf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %a = load i32, ptr %p %1 = uitofp i32 %a to double ret double %1 @@ -126,6 +309,24 @@ define i64 @fcvt_l_d(double %a) nounwind { ; RV64IFD: # %bb.0: ; RV64IFD-NEXT: fcvt.l.d a0, fa0, rtz ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fcvt_l_d: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixdfdi +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_l_d: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixdfdi +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, 
sp, 16 +; RV64I-NEXT: ret %1 = fptosi double %a to i64 ret i64 %1 } @@ -144,6 +345,24 @@ define i64 @fcvt_lu_d(double %a) nounwind { ; RV64IFD: # %bb.0: ; RV64IFD-NEXT: fcvt.lu.d a0, fa0, rtz ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fcvt_lu_d: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixunsdfdi +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_lu_d: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixunsdfdi +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptoui double %a to i64 ret i64 %1 } @@ -164,6 +383,24 @@ define i64 @fmv_x_d(double %a, double %b) nounwind { ; RV64IFD-NEXT: fadd.d fa5, fa0, fa1 ; RV64IFD-NEXT: fmv.x.d a0, fa5 ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fmv_x_d: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __adddf3 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fmv_x_d: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __adddf3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fadd double %a, %b %2 = bitcast double %1 to i64 ret i64 %2 @@ -183,6 +420,24 @@ define double @fcvt_d_l(i64 %a) nounwind { ; RV64IFD: # %bb.0: ; RV64IFD-NEXT: fcvt.d.l fa0, a0 ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_l: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatdidf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_l: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatdidf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = sitofp i64 %a to double ret double %1 } @@ -201,6 +456,24 @@ define double @fcvt_d_lu(i64 %a) nounwind { ; RV64IFD: # %bb.0: ; RV64IFD-NEXT: fcvt.d.lu fa0, a0 ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_lu: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatundidf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_lu: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatundidf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = uitofp i64 %a to double ret double %1 } @@ -225,6 +498,24 @@ define double @fmv_d_x(i64 %a, i64 %b) nounwind { ; RV64IFD-NEXT: fmv.d.x fa4, a1 ; RV64IFD-NEXT: fadd.d fa0, fa5, fa4 ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fmv_d_x: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __adddf3 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fmv_d_x: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __adddf3 +; 
RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = bitcast i64 %a to double %2 = bitcast i64 %b to double %3 = fadd double %1, %2 @@ -236,6 +527,24 @@ define double @fcvt_d_w_i8(i8 signext %a) nounwind { ; CHECKIFD: # %bb.0: ; CHECKIFD-NEXT: fcvt.d.w fa0, a0 ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_w_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatsidf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_w_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatdidf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = sitofp i8 %a to double ret double %1 } @@ -245,6 +554,24 @@ define double @fcvt_d_wu_i8(i8 zeroext %a) nounwind { ; CHECKIFD: # %bb.0: ; CHECKIFD-NEXT: fcvt.d.wu fa0, a0 ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_wu_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatunsidf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_wu_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatundidf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = uitofp i8 %a to double ret double %1 } @@ -254,6 +581,24 @@ define double @fcvt_d_w_i16(i16 signext %a) nounwind { ; CHECKIFD: # %bb.0: ; CHECKIFD-NEXT: fcvt.d.w fa0, a0 ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_w_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatsidf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_w_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatdidf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = sitofp i16 %a to double ret double %1 } @@ -263,6 +608,24 @@ define double @fcvt_d_wu_i16(i16 zeroext %a) nounwind { ; CHECKIFD: # %bb.0: ; CHECKIFD-NEXT: fcvt.d.wu fa0, a0 ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_wu_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatunsidf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_wu_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatundidf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = uitofp i16 %a to double ret double %1 } @@ -281,6 +644,43 @@ define signext i32 @fcvt_d_w_demanded_bits(i32 signext %0, ptr %1) nounwind { ; RV64IFD-NEXT: fcvt.d.w fa5, a0 ; RV64IFD-NEXT: fsd fa5, 0(a1) ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_w_demanded_bits: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; 
RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: addi s1, a0, 1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __floatsidf +; RV32I-NEXT: sw a0, 0(s0) +; RV32I-NEXT: sw a1, 4(s0) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_w_demanded_bits: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: addiw s1, a0, 1 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __floatdidf +; RV64I-NEXT: sd a0, 0(s0) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: ret %3 = add i32 %0, 1 %4 = sitofp i32 %3 to double store double %4, ptr %1, align 8 @@ -301,6 +701,44 @@ define signext i32 @fcvt_d_wu_demanded_bits(i32 signext %0, ptr %1) nounwind { ; RV64IFD-NEXT: fcvt.d.wu fa5, a0 ; RV64IFD-NEXT: fsd fa5, 0(a1) ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fcvt_d_wu_demanded_bits: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: addi s1, a0, 1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __floatunsidf +; RV32I-NEXT: sw a0, 0(s0) +; RV32I-NEXT: sw a1, 4(s0) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_d_wu_demanded_bits: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: addiw s1, a0, 1 +; RV64I-NEXT: slli a0, s1, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: call __floatundidf +; RV64I-NEXT: sd a0, 0(s0) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: ret %3 = add i32 %0, 1 %4 = uitofp i32 %3 to double store double %4, ptr %1, align 8 @@ -321,6 +759,28 @@ define signext i16 @fcvt_w_s_i16(double %a) nounwind { ; RV64IFD-NEXT: slli a0, a0, 48 ; RV64IFD-NEXT: srai a0, a0, 48 ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fcvt_w_s_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixdfsi +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_w_s_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixdfsi +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptosi double %a to i16 ret i16 %1 } @@ -341,6 +801,30 @@ define zeroext 
i16 @fcvt_wu_s_i16(double %a) nounwind { ; RV64IFD-NEXT: addiw a1, a1, -1 ; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fcvt_wu_s_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixunsdfsi +; RV32I-NEXT: lui a1, 16 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_wu_s_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixunsdfsi +; RV64I-NEXT: lui a1, 16 +; RV64I-NEXT: addiw a1, a1, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptoui double %a to i16 ret i16 %1 } @@ -359,6 +843,28 @@ define signext i8 @fcvt_w_s_i8(double %a) nounwind { ; RV64IFD-NEXT: slli a0, a0, 56 ; RV64IFD-NEXT: srai a0, a0, 56 ; RV64IFD-NEXT: ret +; +; RV32I-LABEL: fcvt_w_s_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixdfsi +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a0, a0, 24 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_w_s_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixdfsi +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptosi double %a to i8 ret i8 %1 } @@ -369,6 +875,26 @@ define zeroext i8 @fcvt_wu_s_i8(double %a) nounwind { ; CHECKIFD-NEXT: fcvt.wu.d a0, fa0, rtz ; CHECKIFD-NEXT: andi a0, a0, 255 ; CHECKIFD-NEXT: ret +; +; RV32I-LABEL: fcvt_wu_s_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixunsdfsi +; RV32I-NEXT: andi a0, a0, 255 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_wu_s_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixunsdfsi +; RV64I-NEXT: andi a0, a0, 255 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptoui double %a to i8 ret i8 %1 } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll index e6df28f5f28d1..a14c06726ef5f 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll @@ -3,12 +3,34 @@ ; RUN: -target-abi=ilp32f | FileCheck -check-prefixes=CHECKIF,RV32IF %s ; RUN: llc -mtriple=riscv64 -global-isel -mattr=+f -verify-machineinstrs < %s \ ; RUN: -target-abi=lp64f | FileCheck -check-prefixes=CHECKIF,RV64IF %s +; RUN: llc -mtriple=riscv32 -global-isel -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV32I %s +; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV64I %s define i32 @fcvt_w_s(float %a) nounwind { ; CHECKIF-LABEL: fcvt_w_s: ; CHECKIF: # %bb.0: ; CHECKIF-NEXT: fcvt.w.s a0, fa0, rtz ; CHECKIF-NEXT: ret +; +; RV32I-LABEL: fcvt_w_s: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 
4-byte Folded Spill +; RV32I-NEXT: call __fixsfsi +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_w_s: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixsfsi +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptosi float %a to i32 ret i32 %1 } @@ -18,6 +40,24 @@ define i32 @fcvt_wu_s(float %a) nounwind { ; CHECKIF: # %bb.0: ; CHECKIF-NEXT: fcvt.wu.s a0, fa0, rtz ; CHECKIF-NEXT: ret +; +; RV32I-LABEL: fcvt_wu_s: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixunssfsi +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_wu_s: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixunssfsi +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptoui float %a to i32 ret i32 %1 } @@ -44,6 +84,34 @@ define i32 @fcvt_wu_s_multiple_use(float %x, ptr %y) nounwind { ; RV64IF-NEXT: li a0, 1 ; RV64IF-NEXT: .LBB2_2: ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fcvt_wu_s_multiple_use: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixunssfsi +; RV32I-NEXT: bnez a0, .LBB2_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: .LBB2_2: +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_wu_s_multiple_use: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixunssfsi +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: bnez a1, .LBB2_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: .LBB2_2: +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %a = fptoui float %x to i32 %b = icmp eq i32 %a, 0 %c = select i1 %b, i32 1, i32 %a @@ -63,6 +131,25 @@ define signext i32 @fmv_x_w(float %a, float %b) nounwind { ; RV64IF-NEXT: fmv.x.w a0, fa5 ; RV64IF-NEXT: sext.w a0, a0 ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fmv_x_w: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __addsf3 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fmv_x_w: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __addsf3 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret ; Ensure fmv.x.w is generated even for a soft float calling convention %1 = fadd float %a, %b %2 = bitcast float %1 to i32 @@ -74,6 +161,25 @@ define float @fcvt_s_w(i32 %a) nounwind { ; CHECKIF: # %bb.0: ; CHECKIF-NEXT: fcvt.s.w fa0, a0 ; CHECKIF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_w: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatsisf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_w: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: 
sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: call __floatdisf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = sitofp i32 %a to float ret float %1 } @@ -84,6 +190,26 @@ define float @fcvt_s_w_load(ptr %p) nounwind { ; CHECKIF-NEXT: lw a0, 0(a0) ; CHECKIF-NEXT: fcvt.s.w fa0, a0 ; CHECKIF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_w_load: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: call __floatsisf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_w_load: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: call __floatdisf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %a = load i32, ptr %p %1 = sitofp i32 %a to float ret float %1 @@ -94,6 +220,26 @@ define float @fcvt_s_wu(i32 %a) nounwind { ; CHECKIF: # %bb.0: ; CHECKIF-NEXT: fcvt.s.wu fa0, a0 ; CHECKIF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_wu: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatunsisf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_wu: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: call __floatundisf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = uitofp i32 %a to float ret float %1 } @@ -110,6 +256,26 @@ define float @fcvt_s_wu_load(ptr %p) nounwind { ; RV64IF-NEXT: lwu a0, 0(a0) ; RV64IF-NEXT: fcvt.s.wu fa0, a0 ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_wu_load: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: call __floatunsisf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_wu_load: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: lwu a0, 0(a0) +; RV64I-NEXT: call __floatundisf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %a = load i32, ptr %p %1 = uitofp i32 %a to float ret float %1 @@ -122,6 +288,24 @@ define float @fmv_w_x(i32 %a, i32 %b) nounwind { ; CHECKIF-NEXT: fmv.w.x fa4, a1 ; CHECKIF-NEXT: fadd.s fa0, fa5, fa4 ; CHECKIF-NEXT: ret +; +; RV32I-LABEL: fmv_w_x: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __addsf3 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fmv_w_x: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __addsf3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret ; Ensure fmv.w.x is generated even for a soft float calling convention %1 = bitcast i32 %a to float %2 = bitcast i32 %b to float @@ -143,6 +327,24 @@ define i64 @fcvt_l_s(float %a) nounwind { ; RV64IF: # %bb.0: ; RV64IF-NEXT: fcvt.l.s a0, 
fa0, rtz ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fcvt_l_s: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixsfdi +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_l_s: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixsfdi +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptosi float %a to i64 ret i64 %1 } @@ -161,6 +363,24 @@ define i64 @fcvt_lu_s(float %a) nounwind { ; RV64IF: # %bb.0: ; RV64IF-NEXT: fcvt.lu.s a0, fa0, rtz ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fcvt_lu_s: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixunssfdi +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_lu_s: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixunssfdi +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptoui float %a to i64 ret i64 %1 } @@ -179,6 +399,24 @@ define float @fcvt_s_l(i64 %a) nounwind { ; RV64IF: # %bb.0: ; RV64IF-NEXT: fcvt.s.l fa0, a0 ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_l: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatdisf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_l: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatdisf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = sitofp i64 %a to float ret float %1 } @@ -197,6 +435,24 @@ define float @fcvt_s_lu(i64 %a) nounwind { ; RV64IF: # %bb.0: ; RV64IF-NEXT: fcvt.s.lu fa0, a0 ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_lu: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatundisf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_lu: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatundisf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = uitofp i64 %a to float ret float %1 } @@ -206,6 +462,24 @@ define float @fcvt_s_w_i8(i8 signext %a) nounwind { ; CHECKIF: # %bb.0: ; CHECKIF-NEXT: fcvt.s.w fa0, a0 ; CHECKIF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_w_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatsisf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_w_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatdisf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = sitofp i8 %a to float ret float %1 } @@ -215,6 +489,24 @@ define float @fcvt_s_wu_i8(i8 zeroext %a) nounwind { ; CHECKIF: # %bb.0: ; CHECKIF-NEXT: 
fcvt.s.wu fa0, a0 ; CHECKIF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_wu_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatunsisf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_wu_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatundisf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = uitofp i8 %a to float ret float %1 } @@ -224,6 +516,24 @@ define float @fcvt_s_w_i16(i16 signext %a) nounwind { ; CHECKIF: # %bb.0: ; CHECKIF-NEXT: fcvt.s.w fa0, a0 ; CHECKIF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_w_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatsisf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_w_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatdisf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = sitofp i16 %a to float ret float %1 } @@ -233,6 +543,24 @@ define float @fcvt_s_wu_i16(i16 zeroext %a) nounwind { ; CHECKIF: # %bb.0: ; CHECKIF-NEXT: fcvt.s.wu fa0, a0 ; CHECKIF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_wu_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __floatunsisf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_wu_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __floatundisf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = uitofp i16 %a to float ret float %1 } @@ -252,6 +580,42 @@ define signext i32 @fcvt_s_w_demanded_bits(i32 signext %0, ptr %1) nounwind { ; RV64IF-NEXT: fcvt.s.w fa5, a0 ; RV64IF-NEXT: fsw fa5, 0(a1) ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_w_demanded_bits: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: addi s1, a0, 1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __floatsisf +; RV32I-NEXT: sw a0, 0(s0) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_w_demanded_bits: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: addiw s1, a0, 1 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __floatdisf +; RV64I-NEXT: sw a0, 0(s0) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: ret %3 = add i32 %0, 1 %4 = sitofp i32 %3 to float store float 
%4, ptr %1, align 4 @@ -273,6 +637,43 @@ define signext i32 @fcvt_s_wu_demanded_bits(i32 signext %0, ptr %1) nounwind { ; RV64IF-NEXT: fcvt.s.wu fa5, a0 ; RV64IF-NEXT: fsw fa5, 0(a1) ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fcvt_s_wu_demanded_bits: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: addi s1, a0, 1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __floatunsisf +; RV32I-NEXT: sw a0, 0(s0) +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_s_wu_demanded_bits: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: addiw s1, a0, 1 +; RV64I-NEXT: slli a0, s1, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: call __floatundisf +; RV64I-NEXT: sw a0, 0(s0) +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: ret %3 = add i32 %0, 1 %4 = uitofp i32 %3 to float store float %4, ptr %1, align 4 @@ -293,6 +694,28 @@ define signext i16 @fcvt_w_s_i16(float %a) nounwind { ; RV64IF-NEXT: slli a0, a0, 48 ; RV64IF-NEXT: srai a0, a0, 48 ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fcvt_w_s_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixsfsi +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_w_s_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixsfsi +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptosi float %a to i16 ret i16 %1 } @@ -313,6 +736,30 @@ define zeroext i16 @fcvt_wu_s_i16(float %a) nounwind { ; RV64IF-NEXT: addiw a1, a1, -1 ; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fcvt_wu_s_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixunssfsi +; RV32I-NEXT: lui a1, 16 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_wu_s_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixunssfsi +; RV64I-NEXT: lui a1, 16 +; RV64I-NEXT: addiw a1, a1, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptoui float %a to i16 ret i16 %1 } @@ -331,6 +778,28 @@ define signext i8 @fcvt_w_s_i8(float %a) nounwind { ; RV64IF-NEXT: slli a0, a0, 56 ; RV64IF-NEXT: srai a0, a0, 56 ; RV64IF-NEXT: ret +; +; RV32I-LABEL: fcvt_w_s_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, 
-16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixsfsi +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a0, a0, 24 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_w_s_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixsfsi +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptosi float %a to i8 ret i8 %1 } @@ -341,6 +810,26 @@ define zeroext i8 @fcvt_wu_s_i8(float %a) nounwind { ; CHECKIF-NEXT: fcvt.wu.s a0, fa0, rtz ; CHECKIF-NEXT: andi a0, a0, 255 ; CHECKIF-NEXT: ret +; +; RV32I-LABEL: fcvt_wu_s_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call __fixunssfsi +; RV32I-NEXT: andi a0, a0, 255 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fcvt_wu_s_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call __fixunssfsi +; RV64I-NEXT: andi a0, a0, 255 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = fptoui float %a to i8 ret i8 %1 } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index ae5dbfa4bf30b..ede25d2c9bb07 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -24,19 +24,20 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) { ; CHECK-NEXT: vadd.vi v12, v11, -16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v8, 2 -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vadd.vi v11, v11, -15 ; CHECK-NEXT: vmerge.vim v13, v10, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 -; CHECK-NEXT: vnsrl.wi v8, v14, 0 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 8 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vrgather.vv v8, v13, v12, v0.t -; CHECK-NEXT: vnsrl.wi v12, v14, 8 -; CHECK-NEXT: vmsne.vi v10, v8, 0 -; CHECK-NEXT: vrgather.vv v12, v13, v11, v0.t -; CHECK-NEXT: vmsne.vi v8, v12, 0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vrgather.vv v10, v13, v12, v0.t +; CHECK-NEXT: vrgather.vv v8, v13, v11, v0.t +; CHECK-NEXT: vmsne.vi v0, v10, 0 +; CHECK-NEXT: vmsne.vi v8, v8, 0 ; CHECK-NEXT: ret %vec = load <32 x i1>, ptr %p %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index 5b01eae1ba3c0..5d307211ead6e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -721,24 +721,12 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) { define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) { ; CHECK-LABEL: shuffle_v64i8_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 4112 -; CHECK-NEXT: li 
a1, 240 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a0, a0, 257 -; CHECK-NEXT: vmv.s.x v14, a0 -; CHECK-NEXT: lui a0, 98561 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vcompress.vm v12, v8, v14 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: addi a0, a0, -2048 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vrgather.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v12, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %s = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> ret <8 x i8> %s @@ -810,8 +798,8 @@ define <8 x i32> @shuffle_compress_singlesrc_gaps_e32(<8 x i32> %v) { ret <8 x i32> %out } -define <8 x i32> @shuffle_decompress2_singlesrc_e32(<8 x i32> %v) { -; CHECK-LABEL: shuffle_decompress2_singlesrc_e32: +define <8 x i32> @shuffle_spread2_singlesrc_e32(<8 x i32> %v) { +; CHECK-LABEL: shuffle_spread2_singlesrc_e32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vwaddu.vv v10, v8, v8 @@ -823,18 +811,46 @@ define <8 x i32> @shuffle_decompress2_singlesrc_e32(<8 x i32> %v) { ret <8 x i32> %out } -define <8 x i32> @shuffle_decompress3_singlesrc_e32(<8 x i32> %v) { -; RV32-LABEL: shuffle_decompress3_singlesrc_e32: +define <8 x i32> @shuffle_spread2_singlesrc_e32_index1(<8 x i32> %v) { +; CHECK-LABEL: shuffle_spread2_singlesrc_e32_index1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vwaddu.vv v10, v8, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vwmaccu.vx v10, a0, v8 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> + ret <8 x i32> %out +} + +define <8 x i32> @shuffle_spread2_singlesrc_e32_index2(<8 x i32> %v) { +; CHECK-LABEL: shuffle_spread2_singlesrc_e32_index2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsrl.vi v10, v10, 1 +; CHECK-NEXT: vadd.vi v12, v10, -1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> + ret <8 x i32> %out +} + +define <8 x i32> @shuffle_spread3_singlesrc_e32(<8 x i32> %v) { +; RV32-LABEL: shuffle_spread3_singlesrc_e32: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI55_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI55_0) +; RV32-NEXT: lui a0, %hi(.LCPI57_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI57_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v12, (a0) ; RV32-NEXT: vrgatherei16.vv v10, v8, v12 ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; -; RV64-LABEL: shuffle_decompress3_singlesrc_e32: +; RV64-LABEL: shuffle_spread3_singlesrc_e32: ; RV64: # %bb.0: ; RV64-NEXT: lui a0, 32769 ; RV64-NEXT: slli a0, a0, 21 @@ -849,8 +865,8 @@ define <8 x i32> @shuffle_decompress3_singlesrc_e32(<8 x i32> %v) { } ; TODO: This should be a single vslideup.vi -define <8 x i32> @shuffle_decompress4_singlesrc_e32(<8 x i32> %v) { -; CHECK-LABEL: shuffle_decompress4_singlesrc_e32: +define <8 x i32> @shuffle_spread4_singlesrc_e32(<8 x i32> %v) { +; CHECK-LABEL: shuffle_spread4_singlesrc_e32: ; 
CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vid.v v10 @@ -864,8 +880,8 @@ define <8 x i32> @shuffle_decompress4_singlesrc_e32(<8 x i32> %v) { } ; TODO: This should be either a single vslideup.vi or two widening interleaves. -define <8 x i8> @shuffle_decompress4_singlesrc_e8(<8 x i8> %v) { -; CHECK-LABEL: shuffle_decompress4_singlesrc_e8: +define <8 x i8> @shuffle_spread4_singlesrc_e8(<8 x i8> %v) { +; CHECK-LABEL: shuffle_spread4_singlesrc_e8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vid.v v9 @@ -877,11 +893,25 @@ define <8 x i8> @shuffle_decompress4_singlesrc_e8(<8 x i8> %v) { ret <8 x i8> %out } +define <32 x i8> @shuffle_spread8_singlesrc_e8(<32 x i8> %v) { +; CHECK-LABEL: shuffle_spread8_singlesrc_e8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsrl.vi v12, v10, 3 +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %out = shufflevector <32 x i8> %v, <32 x i8> poison, <32 x i32> + ret <32 x i8> %out +} + define <8 x i32> @shuffle_decompress_singlesrc_e32(<8 x i32> %v) { ; CHECK-LABEL: shuffle_decompress_singlesrc_e32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI58_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI58_0) +; CHECK-NEXT: lui a0, %hi(.LCPI61_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI61_0) ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll index 9d2c722334b08..66f95b7077672 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll @@ -104,7 +104,7 @@ define <4 x i32> @v4i32_v16i32(<16 x i32>) { ; RV32-NEXT: vmv.v.i v0, 10 ; RV32-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV32-NEXT: vslideup.vi v14, v12, 1 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vnsrl.wx v12, v8, a0 ; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma ; RV32-NEXT: vslidedown.vi v8, v8, 8 @@ -116,9 +116,8 @@ define <4 x i32> @v4i32_v16i32(<16 x i32>) { ; RV64-LABEL: v4i32_v16i32: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.i v0, 10 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64-NEXT: vnsrl.wx v12, v8, a0 ; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 8 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll index 27e66690d1b1e..08fd4fb85ff3f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll @@ -67,22 +67,12 @@ define void @deinterleave4_0_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 4 -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vwaddu.vv v10, v8, v9 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, 
v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: vsll.vi v9, v9, 2 -; CHECK-NEXT: vadd.vi v9, v9, -8 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 8 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vv v10, v8, v9, v0.t -; CHECK-NEXT: vse8.v v10, (a1) +; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -96,24 +86,16 @@ define void @deinterleave4_8_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v9, -9 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: li a0, 5 -; CHECK-NEXT: vmadd.vx v10, a0, v9 -; CHECK-NEXT: li a0, 34 -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vcompress.vm v11, v8, v9 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 8 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vv v11, v8, v10, v0.t -; CHECK-NEXT: vse8.v v11, (a1) +; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 - %shuffle.i5 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> + %shuffle.i5 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> store <8 x i8> %shuffle.i5, ptr %out, align 1 ret void } @@ -267,10 +249,12 @@ define void @deinterleave8_0_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 8 -; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret @@ -286,12 +270,14 @@ define void @deinterleave8_8_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.v.i v0, -3 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 8 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t -; CHECK-NEXT: vse8.v v9, (a1) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll index 312520ae28374..847ef9a7b3601 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll @@ -441,28 +441,75 @@ 
entry: } define void @vnsrl_0_i8_single_src(ptr %in, ptr %out) { -; CHECK-LABEL: vnsrl_0_i8_single_src: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vnsrl.wi v8, v8, 0 -; CHECK-NEXT: vse8.v v8, (a1) -; CHECK-NEXT: ret +; V-LABEL: vnsrl_0_i8_single_src: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; V-NEXT: vle8.v v8, (a0) +; V-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vse8.v v8, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_i8_single_src: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; ZVE32F-NEXT: vle8.v v8, (a0) +; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 +; ZVE32F-NEXT: vse8.v v8, (a1) +; ZVE32F-NEXT: ret entry: %0 = load <8 x i8>, ptr %in, align 1 - %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> poison, <8 x i32> - store <8 x i8> %shuffle.i5, ptr %out, align 1 + %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> poison, <4 x i32> + store <4 x i8> %shuffle.i5, ptr %out, align 1 ret void } -define void @vnsrl_0_i8_single_src2(ptr %in, ptr %out) { -; CHECK-LABEL: vnsrl_0_i8_single_src2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vnsrl.wi v8, v8, 0 -; CHECK-NEXT: vse8.v v8, (a1) -; CHECK-NEXT: ret +define void @vnsrl_8_i8_single_src(ptr %in, ptr %out) { +; V-LABEL: vnsrl_8_i8_single_src: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; V-NEXT: vle8.v v8, (a0) +; V-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V-NEXT: vnsrl.wi v8, v8, 8 +; V-NEXT: vse8.v v8, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_8_i8_single_src: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; ZVE32F-NEXT: vle8.v v8, (a0) +; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32F-NEXT: vnsrl.wi v8, v8, 8 +; ZVE32F-NEXT: vse8.v v8, (a1) +; ZVE32F-NEXT: ret +entry: + %0 = load <8 x i8>, ptr %in, align 1 + %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> poison, <4 x i32> + store <4 x i8> %shuffle.i5, ptr %out, align 1 + ret void +} + +define void @vnsrl_0_i8_single_wideuse(ptr %in, ptr %out) { +; V-LABEL: vnsrl_0_i8_single_wideuse: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; V-NEXT: vle8.v v8, (a0) +; V-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; V-NEXT: vse8.v v8, (a1) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_i8_single_wideuse: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; ZVE32F-NEXT: vle8.v v8, (a0) +; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; ZVE32F-NEXT: vse8.v v8, (a1) +; ZVE32F-NEXT: ret entry: %0 = load <8 x i8>, ptr %in, align 1 %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> poison, <8 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 41cf886c3ab75..6de846b2582da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -19,18 +19,19 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) { ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e8, 
m1, ta, mu -; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 -; CHECK-NEXT: vadd.vi v8, v12, -16 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vadd.vi v10, v12, -16 ; CHECK-NEXT: vadd.vi v12, v12, -15 -; CHECK-NEXT: vnsrl.wi v10, v14, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v13, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 8 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vrgather.vv v10, v11, v8, v0.t -; CHECK-NEXT: vnsrl.wi v8, v14, 8 -; CHECK-NEXT: vmsne.vi v10, v10, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vrgather.vv v13, v11, v10, v0.t ; CHECK-NEXT: vrgather.vv v8, v11, v12, v0.t +; CHECK-NEXT: vmsne.vi v0, v13, 0 ; CHECK-NEXT: vmsne.vi v8, v8, 0 -; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec) ret {<16 x i1>, <16 x i1>} %retval diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 39a1bfcda3d83..4338d1f61af72 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -106,95 +106,55 @@ define {, } @vector_deinterleave_load_nxv8i6 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 40 +; CHECK-NEXT: li a2, 24 ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: vl8re64.v v16, (a0) ; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: vadd.vv v24, v8, v8 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vmseq.vi v24, v8, 0 ; CHECK-NEXT: vl8re64.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v8, v24, 1 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vrgather.vv v8, v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v24, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmnot.m v6, v24 +; CHECK-NEXT: vcompress.vm v8, v16, v24 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vcompress.vm v24, v16, v6 +; CHECK-NEXT: vmv1r.v v12, v6 
; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v24, v16, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vcompress.vm v0, v16, v13 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v24, v0 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vcompress.vm v0, v16, v12 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmv4r.v v12, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmv4r.v v28, v16 ; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index f20a90a422313..99743066c79a8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -73,12 +73,13 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vadd.vv v16, v12, v12 -; CHECK-NEXT: vrgather.vv v12, v8, v16 -; CHECK-NEXT: vadd.vi v16, v16, 1 -; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vand.vi v12, v12, 1 +; CHECK-NEXT: vmseq.vi v16, v12, 0 +; CHECK-NEXT: vcompress.vm v12, v8, v16 +; CHECK-NEXT: vmnot.m v14, v16 +; CHECK-NEXT: vcompress.vm v16, v8, v14 ; CHECK-NEXT: vmv2r.v v8, v12 -; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: vmv2r.v v10, v16 ; CHECK-NEXT: ret %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec) ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval @@ -89,12 +90,13 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vadd.vv v24, v16, v16 -; CHECK-NEXT: vrgather.vv v16, v8, v24 -; CHECK-NEXT: vadd.vi v24, v24, 1 -; CHECK-NEXT: vrgather.vv v0, v8, v24 +; CHECK-NEXT: vand.vi v16, v16, 1 +; CHECK-NEXT: vmseq.vi v24, v16, 0 +; CHECK-NEXT: vcompress.vm v16, v8, v24 +; CHECK-NEXT: vmnot.m v20, v24 +; CHECK-NEXT: vcompress.vm v24, v8, v20 ; CHECK-NEXT: vmv4r.v v8, v16 -; CHECK-NEXT: vmv4r.v v12, v0 +; CHECK-NEXT: vmv4r.v v12, v24 ; CHECK-NEXT: ret %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec) ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
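Reviewer note on the vector-deinterleave.ll hunks above: the new lowering drops the two strided vrgather.vv shuffles (indices 2*i and 2*i+1 built with vid.v/vadd.vv/vadd.vi) in favor of an even-lane mask and a pair of vcompress.vm packs. vid.v, vand.vi with 1, and vmseq.vi with 0 build the even mask, vmnot.m flips it for the odd lanes, and each vcompress.vm gathers one half without any index vector. A minimal standalone reproducer, with a hypothetical function name and an element type not taken from this patch, that should exercise the same path under llc -mtriple=riscv64 -mattr=+v:

declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)

define {<vscale x 2 x i32>, <vscale x 2 x i32>} @deinterleave_example(<vscale x 4 x i32> %v) {
  ; Even lanes land in the first result, odd lanes in the second.
  %res = call {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %v)
  ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %res
}

Because vcompress.vm consumes a mask register rather than a whole index vector, the vid/vadd temporaries disappear, which is where the spill reductions in the m8 variants below come from.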
@@ -180,66 +182,50 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv8r.v v24, v8 -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v0, v8, v8 -; CHECK-NEXT: vrgather.vv v8, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v8, v0 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v8, v0, 1 -; CHECK-NEXT: vrgather.vv v0, v24, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v24, v8 +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vand.vi v24, v16, 1 +; CHECK-NEXT: vmseq.vi v16, v24, 0 +; CHECK-NEXT: vcompress.vm v24, v8, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmnot.m v17, v16 +; CHECK-NEXT: vcompress.vm v0, v8, v17 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vcompress.vm v24, v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vcompress.vm v24, v8, v17 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmv4r.v v20, v8 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v4, v8 +; CHECK-NEXT: vmv4r.v v4, v24 ; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -366,12 +352,13 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f ; CHECK: # %bb.0: ; CHECK-NEXT:
vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vadd.vv v16, v12, v12 -; CHECK-NEXT: vrgather.vv v12, v8, v16 -; CHECK-NEXT: vadd.vi v16, v16, 1 -; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vand.vi v12, v12, 1 +; CHECK-NEXT: vmseq.vi v16, v12, 0 +; CHECK-NEXT: vcompress.vm v12, v8, v16 +; CHECK-NEXT: vmnot.m v14, v16 +; CHECK-NEXT: vcompress.vm v16, v8, v14 ; CHECK-NEXT: vmv2r.v v8, v12 -; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: vmv2r.v v10, v16 ; CHECK-NEXT: ret %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec) ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval @@ -436,66 +423,50 @@ define {<vscale x 8 x double>, <vscale x 8 x double>} @vector_deinterleave_nxv8f ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv8r.v v24, v8 -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v0, v8, v8 -; CHECK-NEXT: vrgather.vv v8, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v8, v0 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v8, v0, 1 -; CHECK-NEXT: vrgather.vv v0, v24, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v24, v8 +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vand.vi v24, v16, 1 +; CHECK-NEXT: vmseq.vi v16, v24, 0 +; CHECK-NEXT: vcompress.vm v24, v8, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmnot.m v17, v16 +; CHECK-NEXT: vcompress.vm v0, v8, v17 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vcompress.vm v24, v8, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vcompress.vm v24, v8, v17 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmv4r.v v20, v8 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v4, v8 +; CHECK-NEXT: vmv4r.v v4, v24 ; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16
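The m8 variants above also shrink the frame: the .cfi_escape comments record the CFA as sp + 16 + N * vlenb, and N drops from 32 (40 in the load test) to 24 now that the index vectors no longer live across the shuffles. Decoding the escape bytes per DWARF, as far as I can tell: 0x0f 0x0d is DW_CFA_def_cfa_expression with a 13-byte expression; 0x72 0x00 is DW_OP_breg2 (sp) + 0; 0x11 0x10 followed by 0x22 adds the constant 16; 0x11 0x18 pushes 24 (the old checks had 0x28, i.e. 40); 0x92 0xa2 0x38 0x00 is DW_OP_bregx of register 7202, the vlenb CSR, plus 0; and 0x1e 0x22 multiply and add, matching the appended comment exactly. The new test file below covers zeroinitializer for RISC-V vector tuples, which lowers to one vmv.v.i per register field.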
diff --git a/llvm/test/CodeGen/RISCV/vector-tuple-zeroinitializer.ll b/llvm/test/CodeGen/RISCV/vector-tuple-zeroinitializer.ll new file mode 100644 index 0000000000000..fb1104e0a3b80 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/vector-tuple-zeroinitializer.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -mattr=+v \ +; RUN: -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple=riscv64 -mattr=+v \ +; RUN: -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK + +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_tuple_zero_power_of_2() { +; CHECK-LABEL: test_tuple_zero_power_of_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: ret +entry: + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) zeroinitializer +} + +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_tuple_zero_non_power_of_2() { +; CHECK-LABEL: test_tuple_zero_non_power_of_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: ret +entry: + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) zeroinitializer +} + +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_tuple_zero_insert1(<vscale x 4 x i32> %a) { +; CHECK-LABEL: test_tuple_zero_insert1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: ret +entry: + %1 = call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv16i8_2t.nxv4i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) zeroinitializer, <vscale x 4 x i32> %a, i32 0) + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %1 +} + +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_tuple_zero_insert2(<vscale x 4 x i32> %a) { +; CHECK-LABEL: test_tuple_zero_insert2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v6, 0 +; CHECK-NEXT: vmv2r.v v10, v8 +; CHECK-NEXT: vmv2r.v v8, v6 +; CHECK-NEXT: ret +entry: + %1 = call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv16i8_2t.nxv4i32(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) zeroinitializer, <vscale x 4 x i32> %a, i32 1) + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %1 +} diff --git a/llvm/test/CodeGen/SPIRV/bitcast.ll b/llvm/test/CodeGen/SPIRV/bitcast.ll index 242c5a46583c2..d6c985dbadcc4 100644 --- a/llvm/test/CodeGen/SPIRV/bitcast.ll +++ b/llvm/test/CodeGen/SPIRV/bitcast.ll @@ -6,7 +6,7 @@ ; CHECK-SPIRV-DAG: %[[#TyHalf:]] = OpTypeFloat 16 ; CHECK-SPIRV-DAG: %[[#Arg32:]] = OpFunctionParameter %[[#TyInt32]] ; CHECK-SPIRV-DAG: %[[#Arg16:]] = OpUConvert %[[#TyInt16]] %[[#Arg32]] -; CHECK-SPIRV-DAG: %[[#ValHalf:]] = OpBitcast %[[#TyHalf]] %8 +; CHECK-SPIRV-DAG: %[[#ValHalf:]] = OpBitcast %[[#TyHalf]] %[[#Arg16:]] ; CHECK-SPIRV-DAG: %[[#ValHalf2:]] = OpFMul %[[#TyHalf]] %[[#ValHalf]] %[[#ValHalf]] ; CHECK-SPIRV-DAG: %[[#Res16:]] = OpBitcast %[[#TyInt16]] %[[#ValHalf2]] ; CHECK-SPIRV-DAG: OpReturnValue %[[#Res16]]
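The bitcast.ll fix above belongs to the same cleanup as the PtrCast-in-OpSpecConstantOp.ll and global-ptrtoint.ll hunks further down: a hardcoded SPIR-V result ID such as %8 goes stale whenever emission order shifts, so the tests now bind the defining instruction to a FileCheck variable and reuse it. A sketch of the pattern, with capture names as FileCheck variables rather than fixed IDs:

; CHECK-DAG: %[[#Arg16:]] = OpUConvert %[[#TyInt16]] %[[#Arg32]]
; CHECK-DAG: %[[#ValHalf:]] = OpBitcast %[[#TyHalf]] %[[#Arg16]]

diff --git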
a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-basic.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-basic.ll index d12914d378542..090fe1578a36b 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-basic.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-basic.ll @@ -1,3 +1,6 @@ +; Issue #118011 +; XFAIL: * + ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-unknown-unknown %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_non_semantic_info %s -o - | FileCheck %s --check-prefix=CHECK-OPTION @@ -7,7 +10,7 @@ ; CHECK-MIR-DAG: [[type_i32:%[0-9]+\:type]] = OpTypeInt 32, 0 ; CHECK-MIR-DAG: [[encoding_signedchar:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i32]], 5 ; CHECK-MIR-DAG: [[encoding_float:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i32]], 3 -; CHECK-MIR-DAG: [[flag_zero:%[0-9]+\:iid\(s32\)]] = OpConstantNull [[type_i32]] +; CHECK-MIR-DAG: [[flag_zero:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i32]], 0 ; CHECK-MIR-DAG: [[str_bool:%[0-9]+\:id\(s32\)]] = OpString 1819242338, 0 ; CHECK-MIR-DAG: [[size_8bits:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i32]], 8 ; CHECK-MIR-DAG: [[encoding_boolean:%[0-9]+\:iid\(s32\)]] = OpConstantI [[type_i32]], 2 @@ -60,7 +63,7 @@ ; CHECK-SPIRV-DAG: [[type_int16:%[0-9]+]] = OpTypeInt 16 0 ; CHECK-SPIRV-DAG: [[type_int32:%[0-9]+]] = OpTypeInt 32 0 ; CHECK-SPIRV-DAG: [[encoding_signedchar:%[0-9]+]] = OpConstant [[type_int32]] 5 -; CHECK-SPIRV-DAG: [[flag_zero:%[0-9]+]] = OpConstantNull [[type_int32]] +; CHECK-SPIRV-DAG: [[flag_zero:%[0-9]+]] = OpConstant [[type_int32]] 0 ; CHECK-SPIRV-DAG: [[encoding_float:%[0-9]+]] = OpConstant [[type_int32]] 3 ; CHECK-SPIRV-DAG: [[size_8bit:%[0-9]+]] = OpConstant [[type_int32]] 8 ; CHECK-SPIRV-DAG: [[encoding_boolean:%[0-9]+]] = OpConstant [[type_int32]] 2 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll new file mode 100644 index 0000000000000..e21d99badea06 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll @@ -0,0 +1,20 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_EXT_optnone %s -o - | FileCheck %s --check-prefixes=CHECK-EXTENSION +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-NO-EXTENSION + +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_optnone %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-EXTENSION: OpCapability OptNoneEXT +; CHECK-EXTENSION: OpExtension "SPV_EXT_optnone" +; CHECK-NO-EXTENSION-NOT: OpCapability OptNoneINTEL +; CHECK-NO-EXTENSION-NOT: OpCapability OptNoneEXT +; CHECK-NO-EXTENSION-NOT: OpExtension "SPV_INTEL_optnone" +; CHECK-NO-EXTENSION-NOT: OpExtension "SPV_EXT_optnone" + +define spir_func void @foo() #0 { +; CHECK-EXTENSION: %[[#]] = OpFunction %[[#]] DontInline|OptNoneEXT %[[#]] +entry: + ret void +} + +attributes #0 = { nounwind optnone noinline }
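Two notes on the hunks above. In debug-type-basic.ll, the zeroed flag operand of the nonsemantic debug instructions is now matched as an explicit integer constant (OpConstantI .. 0 in MIR, OpConstant .. 0 in the disassembly) rather than OpConstantNull, and the test is XFAILed pending issue #118011. SPV_EXT_optnone is, to my understanding, the ratified successor of the vendor extension SPV_INTEL_optnone, with OptNoneEXT and OptNoneINTEL sharing a capability token; the observable effect is confined to the function control mask, roughly (result IDs below are hypothetical):

; With +SPV_EXT_optnone:   %5 = OpFunction %2 DontInline|OptNoneEXT %4
; Without the extension:   %5 = OpFunction %2 DontInline %4

diff --git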
a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_media_block_io/builtin-op-wrappers.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_media_block_io/builtin-op-wrappers.ll new file mode 100644 index 0000000000000..c30370c179516 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_media_block_io/builtin-op-wrappers.ll @@ -0,0 +1,115 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_media_block_io %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_media_block_io %s -o - -filetype=obj | spirv-val %} + +; CHECK: Capability SubgroupImageMediaBlockIOINTEL +; CHECK: Extension "SPV_INTEL_media_block_io" +; CHECK-COUNT-14: SubgroupImageMediaBlockReadINTEL +; CHECK-COUNT-14: SubgroupImageMediaBlockWriteINTEL + +define spir_kernel void @intel_media_block_test(<2 x i32> %edgeCoord, ptr addrspace(1) %image_in, ptr addrspace(1) %image_out) !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_base_type !8 { +entry: + %call = call spir_func i8 @_Z46__spirv_SubgroupImageMediaBlockReadINTEL_RcharPU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call1 = call spir_func <2 x i8> @_Z47__spirv_SubgroupImageMediaBlockReadINTEL_Rchar2PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call2 = call spir_func <4 x i8> @_Z47__spirv_SubgroupImageMediaBlockReadINTEL_Rchar4PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call3 = call spir_func <8 x i8> @_Z47__spirv_SubgroupImageMediaBlockReadINTEL_Rchar8PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call4 = call spir_func <16 x i8> @_Z48__spirv_SubgroupImageMediaBlockReadINTEL_Rchar16PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call5 = call spir_func i16 @_Z47__spirv_SubgroupImageMediaBlockReadINTEL_RshortPU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call6 = call spir_func <2 x i16> @_Z48__spirv_SubgroupImageMediaBlockReadINTEL_Rshort2PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call7 = call spir_func <4 x i16> @_Z48__spirv_SubgroupImageMediaBlockReadINTEL_Rshort4PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call8 = call spir_func <8 x i16> @_Z48__spirv_SubgroupImageMediaBlockReadINTEL_Rshort8PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call9 = call spir_func <16 x i16> @_Z49__spirv_SubgroupImageMediaBlockReadINTEL_Rshort16PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call10 = call spir_func i32 @_Z45__spirv_SubgroupImageMediaBlockReadINTEL_RintPU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call11 = call spir_func <2 x i32> @_Z46__spirv_SubgroupImageMediaBlockReadINTEL_Rint2PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call12 = call spir_func <4 x i32> 
@_Z46__spirv_SubgroupImageMediaBlockReadINTEL_Rint4PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + %call13 = call spir_func <8 x i32> @_Z46__spirv_SubgroupImageMediaBlockReadINTEL_Rint8PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1) %image_in, <2 x i32> %edgeCoord, i32 1, i32 16) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiic(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, i8 %call) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv2_c(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, <2 x i8> %call1) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv4_c(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, <4 x i8> %call2) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv8_c(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, <8 x i8> %call3) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv16_c(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, <16 x i8> %call4) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiis(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, i16 %call5) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv2_s(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, <2 x i16> %call6) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv4_s(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, <4 x i16> %call7) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv8_s(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, <8 x i16> %call8) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv16_s(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, <16 x i16> %call9) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiii(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, i32 %call10) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiS2_(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, <2 x i32> %call11) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv4_i(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, <4 x i32> %call12) + call spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv8_i(ptr addrspace(1) %image_out, <2 x i32> %edgeCoord, i32 1, i32 16, <8 x i32> %call13) + ret void +} + +declare spir_func i8 @_Z46__spirv_SubgroupImageMediaBlockReadINTEL_RcharPU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func <2 x i8> @_Z47__spirv_SubgroupImageMediaBlockReadINTEL_Rchar2PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x 
i32>, i32, i32) + +declare spir_func <4 x i8> @_Z47__spirv_SubgroupImageMediaBlockReadINTEL_Rchar4PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func <8 x i8> @_Z47__spirv_SubgroupImageMediaBlockReadINTEL_Rchar8PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func <16 x i8> @_Z48__spirv_SubgroupImageMediaBlockReadINTEL_Rchar16PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func i16 @_Z47__spirv_SubgroupImageMediaBlockReadINTEL_RshortPU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func <2 x i16> @_Z48__spirv_SubgroupImageMediaBlockReadINTEL_Rshort2PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func <4 x i16> @_Z48__spirv_SubgroupImageMediaBlockReadINTEL_Rshort4PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func <8 x i16> @_Z48__spirv_SubgroupImageMediaBlockReadINTEL_Rshort8PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func <16 x i16> @_Z49__spirv_SubgroupImageMediaBlockReadINTEL_Rshort16PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func i32 @_Z45__spirv_SubgroupImageMediaBlockReadINTEL_RintPU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func <2 x i32> @_Z46__spirv_SubgroupImageMediaBlockReadINTEL_Rint2PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func <4 x i32> @_Z46__spirv_SubgroupImageMediaBlockReadINTEL_Rint4PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func <8 x i32> @_Z46__spirv_SubgroupImageMediaBlockReadINTEL_Rint8PU3AS133__spirv_Image__void_1_0_0_0_0_0_0Dv2_iii(ptr addrspace(1), <2 x i32>, i32, i32) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiic(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, i8) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv2_c(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, <2 x i8>) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv4_c(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, <4 x i8>) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv8_c(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, <8 x i8>) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv16_c(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, <16 x i8>) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiis(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, i16) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv2_s(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, <2 x i16>) + +declare spir_func void 
@_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv4_s(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, <4 x i16>) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv8_s(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, <8 x i16>) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv16_s(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, <16 x i16>) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiii(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, i32) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiS2_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, <2 x i32>) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv4_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, <4 x i32>) + +declare spir_func void @_Z41__spirv_SubgroupImageMediaBlockWriteINTELPU3AS133__spirv_Image__void_1_0_0_0_0_0_1Dv2_iiiDv8_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, i32, i32, <8 x i32>) + +!spirv.MemoryModel = !{!0} +!opencl.enable.FP_CONTRACT = !{} +!spirv.Source = !{!1} +!opencl.spir.version = !{!2} +!opencl.ocl.version = !{!2} +!opencl.used.extensions = !{!3} +!opencl.used.optional.core.features = !{!4} +!spirv.Generator = !{!5} + +!0 = !{i32 1, i32 2} +!1 = !{i32 3, i32 200000} +!2 = !{i32 2, i32 0} +!3 = !{} +!4 = !{!"cl_images"} +!5 = !{i16 6, i16 14} +!6 = !{i32 0, i32 1, i32 1} +!7 = !{!"none", !"read_only", !"write_only"} +!8 = !{!"int2", !"image2d_t", !"image2d_t"} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_media_block_io/cl_intel_media_block_io.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_media_block_io/cl_intel_media_block_io.ll new file mode 100644 index 0000000000000..735094c7c8862 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_media_block_io/cl_intel_media_block_io.ll @@ -0,0 +1,115 @@ +; Compiled from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_media_block_io/SPV_INTEL_media_block_io.cl + +; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; CHECK-ERROR: LLVM ERROR: intel_sub_group_media_block_read_uc: the builtin requires the following SPIR-V extension: SPV_INTEL_media_block_io + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_media_block_io %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_media_block_io %s -o - -filetype=obj | spirv-val %} +; CHECK: Capability SubgroupImageMediaBlockIOINTEL +; CHECK: Extension "SPV_INTEL_media_block_io" + +; CHECK-COUNT-14: SubgroupImageMediaBlockReadINTEL +; CHECK-COUNT-14: SubgroupImageMediaBlockWriteINTEL + +define spir_kernel void @intel_media_block_test(<2 x i32> noundef %edgeCoord, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !6 { +entry: + %call = tail call spir_func zeroext i8 
@_Z35intel_sub_group_media_block_read_ucDv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call1 = tail call spir_func <2 x i8> @_Z36intel_sub_group_media_block_read_uc2Dv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call2 = tail call spir_func <4 x i8> @_Z36intel_sub_group_media_block_read_uc4Dv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call3 = tail call spir_func <8 x i8> @_Z36intel_sub_group_media_block_read_uc8Dv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call4 = tail call spir_func <16 x i8> @_Z37intel_sub_group_media_block_read_uc16Dv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call5 = tail call spir_func zeroext i16 @_Z35intel_sub_group_media_block_read_usDv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call6 = tail call spir_func <2 x i16> @_Z36intel_sub_group_media_block_read_us2Dv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call7 = tail call spir_func <4 x i16> @_Z36intel_sub_group_media_block_read_us4Dv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call8 = tail call spir_func <8 x i16> @_Z36intel_sub_group_media_block_read_us8Dv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call9 = tail call spir_func <16 x i16> @_Z37intel_sub_group_media_block_read_us16Dv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call10 = tail call spir_func i32 @_Z35intel_sub_group_media_block_read_uiDv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call11 = tail call spir_func <2 x i32> @_Z36intel_sub_group_media_block_read_ui2Dv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call12 = tail call spir_func <4 x i32> @_Z36intel_sub_group_media_block_read_ui4Dv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + %call13 = tail call spir_func <8 x i32> @_Z36intel_sub_group_media_block_read_ui8Dv2_iii14ocl_image2d_ro(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src_luma_image) + tail call spir_func void @_Z36intel_sub_group_media_block_write_ucDv2_iiih14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, i8 noundef zeroext %call, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void 
@_Z37intel_sub_group_media_block_write_uc2Dv2_iiiDv2_h14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, <2 x i8> noundef %call1, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z37intel_sub_group_media_block_write_uc4Dv2_iiiDv4_h14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, <4 x i8> noundef %call2, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z37intel_sub_group_media_block_write_uc8Dv2_iiiDv8_h14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, <8 x i8> noundef %call3, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z38intel_sub_group_media_block_write_uc16Dv2_iiiDv16_h14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, <16 x i8> noundef %call4, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z36intel_sub_group_media_block_write_usDv2_iiit14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, i16 noundef zeroext %call5, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z37intel_sub_group_media_block_write_us2Dv2_iiiDv2_t14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, <2 x i16> noundef %call6, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z37intel_sub_group_media_block_write_us4Dv2_iiiDv4_t14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, <4 x i16> noundef %call7, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z37intel_sub_group_media_block_write_us8Dv2_iiiDv8_t14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, <8 x i16> noundef %call8, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z38intel_sub_group_media_block_write_us16Dv2_iiiDv16_t14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, <16 x i16> noundef %call9, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z36intel_sub_group_media_block_write_uiDv2_iiij14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, i32 noundef %call10, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z37intel_sub_group_media_block_write_ui2Dv2_iiiDv2_j14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, <2 x i32> noundef %call11, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z37intel_sub_group_media_block_write_ui4Dv2_iiiDv4_j14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, <4 x i32> noundef %call12, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + tail call spir_func void @_Z37intel_sub_group_media_block_write_ui8Dv2_iiiDv8_j14ocl_image2d_wo(<2 x i32> noundef %edgeCoord, i32 noundef 1, i32 noundef 16, <8 x i32> noundef %call13, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %dst_luma_image) + ret void +} + +declare spir_func zeroext i8 @_Z35intel_sub_group_media_block_read_ucDv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func <2 x i8> @_Z36intel_sub_group_media_block_read_uc2Dv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 
noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func <4 x i8> @_Z36intel_sub_group_media_block_read_uc4Dv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func <8 x i8> @_Z36intel_sub_group_media_block_read_uc8Dv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func <16 x i8> @_Z37intel_sub_group_media_block_read_uc16Dv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func zeroext i16 @_Z35intel_sub_group_media_block_read_usDv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func <2 x i16> @_Z36intel_sub_group_media_block_read_us2Dv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func <4 x i16> @_Z36intel_sub_group_media_block_read_us4Dv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func <8 x i16> @_Z36intel_sub_group_media_block_read_us8Dv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func <16 x i16> @_Z37intel_sub_group_media_block_read_us16Dv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func i32 @_Z35intel_sub_group_media_block_read_uiDv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func <2 x i32> @_Z36intel_sub_group_media_block_read_ui2Dv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func <4 x i32> @_Z36intel_sub_group_media_block_read_ui4Dv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func <8 x i32> @_Z36intel_sub_group_media_block_read_ui8Dv2_iii14ocl_image2d_ro(<2 x i32> noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) + +declare spir_func void @_Z36intel_sub_group_media_block_write_ucDv2_iiih14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, i8 noundef zeroext, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z37intel_sub_group_media_block_write_uc2Dv2_iiiDv2_h14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, <2 x i8> noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z37intel_sub_group_media_block_write_uc4Dv2_iiiDv4_h14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, <4 x i8> noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z37intel_sub_group_media_block_write_uc8Dv2_iiiDv8_h14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, <8 x i8> noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z38intel_sub_group_media_block_write_uc16Dv2_iiiDv16_h14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, <16 x i8> noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z36intel_sub_group_media_block_write_usDv2_iiit14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, i16 
noundef zeroext, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z37intel_sub_group_media_block_write_us2Dv2_iiiDv2_t14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, <2 x i16> noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z37intel_sub_group_media_block_write_us4Dv2_iiiDv4_t14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, <4 x i16> noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z37intel_sub_group_media_block_write_us8Dv2_iiiDv8_t14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, <8 x i16> noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z38intel_sub_group_media_block_write_us16Dv2_iiiDv16_t14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, <16 x i16> noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z36intel_sub_group_media_block_write_uiDv2_iiij14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, i32 noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z37intel_sub_group_media_block_write_ui2Dv2_iiiDv2_j14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, <2 x i32> noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z37intel_sub_group_media_block_write_ui4Dv2_iiiDv4_j14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, <4 x i32> noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + +declare spir_func void @_Z37intel_sub_group_media_block_write_ui8Dv2_iiiDv8_j14ocl_image2d_wo(<2 x i32> noundef, i32 noundef, i32 noundef, <8 x i32> noundef, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1)) + + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 2, i32 0} +!2 = !{!"clang version 20.0.0git (https://github.com/llvm/llvm-project.git 32da1fd8c7d45d5209c6c781910c51940779ec52)"} +!3 = !{i32 0, i32 1, i32 1} +!4 = !{!"none", !"read_only", !"write_only"} +!5 = !{!"int2", !"image2d_t", !"image2d_t"} +!6 = !{!"int __attribute__((ext_vector_type(2)))", !"image2d_t", !"image2d_t"} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_optnone.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_optnone.ll index 1744ec9680401..9830b8b4cd2d8 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_optnone.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_optnone.ll @@ -1,25 +1,20 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_optnone %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-EXTENSION -; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO-EXTENSION +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_optnone %s -o - | FileCheck %s --check-prefixes=CHECK-EXTENSION +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-NO-EXTENSION -; CHECK-EXTENSION: OpCapability OptNoneINTEL +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_optnone %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-EXTENSION: 
OpCapability OptNoneEXT ; CHECK-EXTENSION: OpExtension "SPV_INTEL_optnone" ; CHECK-NO-EXTENSION-NOT: OpCapability OptNoneINTEL +; CHECK-NO-EXTENSION-NOT: OpCapability OptNoneEXT ; CHECK-NO-EXTENSION-NOT: OpExtension "SPV_INTEL_optnone" +; CHECK-NO-EXTENSION-NOT: OpExtension "SPV_EXT_optnone" -; Function Attrs: nounwind optnone noinline define spir_func void @_Z3foov() #0 { -; CHECK-LABEL: _Z3foov -; CHECK: %4 = OpFunction %2 DontInline %3 -; CHECK-NEXT: %5 = OpLabel -; CHECK-NEXT: OpReturn -; CHECK-NEXT: OpFunctionEnd +; CHECK-EXTENSION: %[[#]] = OpFunction %[[#]] DontInline|OptNoneEXT %[[#]] entry: ret void } attributes #0 = { nounwind optnone noinline } - -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-EXTENSION: {{.*}} -; CHECK-NO-EXTENSION: {{.*}} diff --git a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll index cd1a1b0080c62..55d638f80cc55 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll @@ -15,7 +15,7 @@ ; CHECK-DAG: %[[GenPtrChar:.*]] = OpTypePointer Generic %[[Char]] ; CHECK-DAG: %[[CWPtrChar:.*]] = OpTypePointer CrossWorkgroup %[[Char]] ; CHECK-DAG: %[[Arr1:.*]] = OpTypeArray %[[CWPtrChar]] %[[#]] -; CHECK-DAG: %[[Struct1:.*]] = OpTypeStruct %8 +; CHECK-DAG: %[[Struct1:.*]] = OpTypeStruct %[[Arr1]] ; CHECK-DAG: %[[Arr2:.*]] = OpTypeArray %[[GenPtrChar]] %[[#]] ; CHECK-DAG: %[[Struct2:.*]] = OpTypeStruct %[[Arr2]] ; CHECK-DAG: %[[GenPtr:.*]] = OpTypePointer Generic %[[Int]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/global-addrspacecast.ll b/llvm/test/CodeGen/SPIRV/pointers/global-addrspacecast.ll new file mode 100644 index 0000000000000..544c657da8488 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/global-addrspacecast.ll @@ -0,0 +1,17 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +@PrivInternal = internal addrspace(10) global i32 456 +; CHECK-DAG: %[[#type:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#ptrty:]] = OpTypePointer Private %[[#type]] +; CHECK-DAG: %[[#value:]] = OpConstant %[[#type]] 456 +; CHECK-DAG: %[[#var:]] = OpVariable %[[#ptrty]] Private %[[#value]] + +define spir_kernel void @Foo() { + %p = addrspacecast ptr addrspace(10) @PrivInternal to ptr + %v = load i32, ptr %p, align 4 + ret void +; CHECK: OpLabel +; CHECK-NEXT: OpLoad %[[#type]] %[[#var]] Aligned 4 +; CHECK-NEXT: OpReturn +} diff --git a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll index 7982893a0a913..16c20f9067e6e 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll @@ -11,7 +11,7 @@ ; CHECK-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyI64]] %[[TyI64]] ; CHECK-DAG: %[[Const128:.*]] = OpConstant %[[TyI64]] 128 ; CHECK-DAG: %[[GlobalValue]] = OpVariable -; CHECK-DAG: %[[PtrToInt:.*]] = OpSpecConstantOp %[[TyI64]] 117 %12 +; CHECK-DAG: %[[PtrToInt:.*]] = OpSpecConstantOp %[[TyI64]] 117 %[[GlobalValue]] ; TODO: The following bitcast line looks unneeded and we may expect it to be removed in future ; CHECK-DAG: %[[UseGlobalValue:.*]] = OpSpecConstantOp %[[TyI64]] 124 %[[PtrToInt]] ; CHECK-DAG: %[[ConstComposite:.*]] = OpConstantComposite %[[TyStruct]] %[[Const128]] %[[UseGlobalValue]]
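For readers decoding the global-ptrtoint.ll checks just above: the literal after OpSpecConstantOp is a SPIR-V opcode number, and if I read the spec correctly 117 is OpConvertPtrToU while 124 is OpBitcast, so the composite initializer is ptrtoint(@GlobalValue) wrapped in the bitcast that the in-test TODO already flags as probably redundant. The IR-level equivalent is an ordinary constant expression, sketched here with a placeholder global not taken from the test:

@g = addrspace(1) global i32 0
@s = global { i64, i64 } { i64 128, i64 ptrtoint (ptr addrspace(1) @g to i64) }

diff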
--git a/llvm/test/CodeGen/SPIRV/pointers/tangled-ret.ll b/llvm/test/CodeGen/SPIRV/pointers/tangled-ret.ll new file mode 100644 index 0000000000000..985893029db89 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/tangled-ret.ll @@ -0,0 +1,235 @@ +; The only pass criterion is that spirv-val considers output valid. + +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +%subgr = type { i64, i64 } +%t_range = type { %t_arr } +%t_arr = type { [1 x i64] } +%t_arr2 = type { [4 x i32] } + +define internal spir_func noundef i32 @geti32() { +entry: + ret i32 100 +} + +define internal spir_func noundef i64 @geti64() { +entry: + ret i64 200 +} + +define internal spir_func void @enable_if(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %this, i64 noundef %dim0) { +entry: + %this.addr = alloca ptr addrspace(4), align 8 + %dim0.addr = alloca i64, align 8 + store ptr addrspace(4) %this, ptr %this.addr, align 8 + store i64 %dim0, ptr %dim0.addr, align 8 + %this1 = load ptr addrspace(4), ptr %this.addr, align 8 + %0 = load i64, ptr %dim0.addr, align 8 + call spir_func void @enable_if_2(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %this1, i64 noundef %0) + ret void +} + + +define internal spir_func void @test(ptr addrspace(4) noundef align 8 dereferenceable_or_null(16) %this, ptr addrspace(4) noundef align 4 dereferenceable(16) %bits, ptr noundef byval(%t_range) align 8 %pos) { +entry: + %this.addr = alloca ptr addrspace(4), align 8 + %bits.addr = alloca ptr addrspace(4), align 8 + %cur_pos = alloca i64, align 8 + %__range4 = alloca ptr addrspace(4), align 8 + %__begin0 = alloca ptr addrspace(4), align 8 + %__end0 = alloca ptr addrspace(4), align 8 + %cleanup.dest.slot = alloca i32, align 4 + %elem = alloca ptr addrspace(4), align 8 + %agg.tmp = alloca %t_range, align 8 + %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr, align 8 + store ptr addrspace(4) %bits, ptr %bits.addr, align 8 + %pos.ascast = addrspacecast ptr %pos to ptr addrspace(4) + %this1 = load ptr addrspace(4), ptr %this.addr, align 8 + %call = call spir_func noundef i64 @getp(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %pos.ascast, i32 noundef 0) + store i64 %call, ptr %cur_pos, align 8 + %0 = load ptr addrspace(4), ptr %bits.addr, align 8 + store ptr addrspace(4) %0, ptr %__range4, align 8 + %1 = load ptr addrspace(4), ptr %__range4, align 8 + %call2 = call spir_func noundef ptr addrspace(4) @beginp(ptr addrspace(4) noundef align 4 dereferenceable_or_null(16) %1) + store ptr addrspace(4) %call2, ptr %__begin0, align 8 + %2 = load ptr addrspace(4), ptr %__range4, align 8 + %call3 = call spir_func noundef ptr addrspace(4) @endp(ptr addrspace(4) noundef align 4 dereferenceable_or_null(16) %2) + store ptr addrspace(4) %call3, ptr %__end0, align 8 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %3 = load ptr addrspace(4), ptr %__begin0, align 8 + %4 = load ptr addrspace(4), ptr %__end0, align 8 + %cmp = icmp ne ptr addrspace(4) %3, %4 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + br label %for.end + +for.body: ; preds = %for.cond + %5 = load ptr addrspace(4), ptr %__begin0, align 8 + store ptr addrspace(4) %5, ptr %elem, align 8 + %6 = load i64, ptr %cur_pos, align 8 + %call4 = call spir_func noundef i32 @maskp(ptr addrspace(4) noundef align 8 dereferenceable_or_null(16) %this1) + %conv = zext i32 %call4 to i64 + %cmp5 = 
icmp ult i64 %6, %conv + br i1 %cmp5, label %if.then, label %if.else + +if.then: ; preds = %for.body + %7 = load ptr addrspace(4), ptr %elem, align 8 + %8 = load i64, ptr %cur_pos, align 8 + call spir_func void @enable_if(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %agg.tmp.ascast, i64 noundef %8) + call spir_func void @extract_bits(ptr addrspace(4) noundef align 8 dereferenceable_or_null(16) %this1, ptr addrspace(4) noundef align 4 dereferenceable(4) %7, ptr noundef byval(%t_range) align 8 %agg.tmp) + %9 = load i64, ptr %cur_pos, align 8 + %add = add i64 %9, 32 + store i64 %add, ptr %cur_pos, align 8 + br label %if.end + +if.else: ; preds = %for.body + %10 = load ptr addrspace(4), ptr %elem, align 8 + store i32 0, ptr addrspace(4) %10, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %for.inc + +for.inc: ; preds = %if.end + %11 = load ptr addrspace(4), ptr %__begin0, align 8 + %incdec.ptr = getelementptr inbounds nuw i32, ptr addrspace(4) %11, i32 1 + store ptr addrspace(4) %incdec.ptr, ptr %__begin0, align 8 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + ret void +} + +define internal spir_func noundef i64 @getp(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %this, i32 noundef %dimension) { +entry: + %this.addr.i = alloca ptr addrspace(4), align 8 + %dimension.addr.i = alloca i32, align 4 + %retval = alloca i64, align 8 + %this.addr = alloca ptr addrspace(4), align 8 + %dimension.addr = alloca i32, align 4 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr, align 8 + store i32 %dimension, ptr %dimension.addr, align 4 + %this1 = load ptr addrspace(4), ptr %this.addr, align 8 + %0 = load i32, ptr %dimension.addr, align 4 + store ptr addrspace(4) %this1, ptr %this.addr.i, align 8 + store i32 %0, ptr %dimension.addr.i, align 4 + %this1.i = load ptr addrspace(4), ptr %this.addr.i, align 8 + %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %1 = load i32, ptr %dimension.addr, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array1, i64 0, i64 %idxprom + %2 = load i64, ptr addrspace(4) %arrayidx, align 8 + ret i64 %2 +} + +define internal spir_func noundef ptr addrspace(4) @beginp(ptr addrspace(4) noundef align 4 dereferenceable_or_null(16) %this) { +entry: + %retval = alloca ptr addrspace(4), align 8 + %this.addr = alloca ptr addrspace(4), align 8 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr, align 8 + %this1 = load ptr addrspace(4), ptr %this.addr, align 8 + %MData1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %arraydecay2 = bitcast ptr addrspace(4) %MData1 to ptr addrspace(4) + ret ptr addrspace(4) %arraydecay2 +} + +define internal spir_func noundef ptr addrspace(4) @endp(ptr addrspace(4) noundef align 4 dereferenceable_or_null(16) %this) { +entry: + %retval = alloca ptr addrspace(4), align 8 + %this.addr = alloca ptr addrspace(4), align 8 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr, align 8 + %this1 = load ptr addrspace(4), ptr %this.addr, align 8 + %MData1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %arraydecay2 = bitcast ptr addrspace(4) %MData1 to ptr addrspace(4) + %add.ptr = getelementptr inbounds nuw i32, ptr addrspace(4) %arraydecay2, i64 4 + ret ptr addrspace(4) %add.ptr +} + +define internal spir_func noundef 
i32 @maskp(ptr addrspace(4) noundef align 8 dereferenceable_or_null(16) %this) { +entry: + %retval = alloca i32, align 4 + %this.addr = alloca ptr addrspace(4), align 8 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr, align 8 + %this1 = load ptr addrspace(4), ptr %this.addr, align 8 + %bits_num = getelementptr inbounds nuw %subgr, ptr addrspace(4) %this1, i32 0, i32 1 + %0 = load i64, ptr addrspace(4) %bits_num, align 8 + %conv = trunc i64 %0 to i32 + ret i32 %conv +} + +define internal spir_func void @enable_if_2(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %this, i64 noundef %dim0) { +entry: + %this.addr = alloca ptr addrspace(4), align 8 + %dim0.addr = alloca i64, align 8 + store ptr addrspace(4) %this, ptr %this.addr, align 8 + store i64 %dim0, ptr %dim0.addr, align 8 + %this1 = load ptr addrspace(4), ptr %this.addr, align 8 + %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %0 = load i64, ptr %dim0.addr, align 8 + store i64 %0, ptr addrspace(4) %common_array1, align 8 + ret void +} + +define internal spir_func void @extract_bits(ptr addrspace(4) noundef align 8 dereferenceable_or_null(16) %this, ptr addrspace(4) noundef align 4 dereferenceable(4) %bits, ptr noundef byval(%t_range) align 8 %pos) { +entry: + %this.addr = alloca ptr addrspace(4), align 8 + %bits.addr = alloca ptr addrspace(4), align 8 + %Res = alloca i64, align 8 + store ptr addrspace(4) %this, ptr %this.addr, align 8 + store ptr addrspace(4) %bits, ptr %bits.addr, align 8 + %pos.ascast = addrspacecast ptr %pos to ptr addrspace(4) + %this1 = load ptr addrspace(4), ptr %this.addr, align 8 + %Bits1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %0 = load i64, ptr addrspace(4) %Bits1, align 8 + store i64 %0, ptr %Res, align 8 + %bits_num = getelementptr inbounds nuw %subgr, ptr addrspace(4) %this1, i32 0, i32 1 + %1 = load i64, ptr addrspace(4) %bits_num, align 8 + %call = call spir_func noundef i64 @geti64() + %2 = load i64, ptr %Res, align 8 + %and = and i64 %2, %call + store i64 %and, ptr %Res, align 8 + %call2 = call spir_func noundef i64 @geti64() + %call3 = call spir_func noundef i32 @geti32() + %conv = zext i32 %call3 to i64 + %cmp = icmp ult i64 %call2, %conv + br i1 %cmp, label %if.then, label %if.else + +if.else: ; preds = %entry + %3 = load ptr addrspace(4), ptr %bits.addr, align 8 + store i32 0, ptr addrspace(4) %3, align 4 + br label %if.end11 + +if.then: ; preds = %entry + %call4 = call spir_func noundef i64 @geti64() + %cmp5 = icmp ugt i64 %call4, 0 + br i1 %cmp5, label %if.then6, label %if.end + +if.then6: ; preds = %if.then + %call7 = call spir_func noundef i64 @geti64() + %4 = load i64, ptr %Res, align 8 + %shr = lshr i64 %4, %call7 + store i64 %shr, ptr %Res, align 8 + br label %if.end + +if.end: ; preds = %if.then6, %if.then + %call8 = call spir_func noundef i64 @geti64() + %5 = load i64, ptr %Res, align 8 + %and9 = and i64 %5, %call8 + store i64 %and9, ptr %Res, align 8 + %6 = load i64, ptr %Res, align 8 + %conv10 = trunc i64 %6 to i32 + %7 = load ptr addrspace(4), ptr %bits.addr, align 8 + store i32 %conv10, ptr addrspace(4) %7, align 4 + br label %if.end11 + +if.end11: ; preds = %if.else, %if.end + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/pointers/variables-storage-class-vk.ll b/llvm/test/CodeGen/SPIRV/pointers/variables-storage-class-vk.ll new file mode 100644 index 0000000000000..e8b1dc263f150 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/variables-storage-class-vk.ll @@ -0,0 
+1,15 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#U32:]] = OpTypeInt 32 0 + +; CHECK-DAG: %[[#VAL:]] = OpConstant %[[#U32]] 456 +; CHECK-DAG: %[[#VTYPE:]] = OpTypePointer Private %[[#U32]] +; CHECK-DAG: %[[#VAR:]] = OpVariable %[[#VTYPE]] Private %[[#VAL]] +; CHECK-NOT: OpDecorate %[[#VAR]] LinkageAttributes +@PrivInternal = internal addrspace(10) global i32 456 + +define void @main() { + %l = load i32, ptr addrspace(10) @PrivInternal + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/pointers/variables-storage-class.ll b/llvm/test/CodeGen/SPIRV/pointers/variables-storage-class.ll index 2d4c805ac9df1..a1ded0569d67e 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/variables-storage-class.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/variables-storage-class.ll @@ -1,17 +1,29 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; CHECK-DAG: %[[#U8:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#U32:]] = OpTypeInt 32 0 + +; CHECK-DAG: %[[#TYPE:]] = OpTypePointer CrossWorkgroup %[[#U8]] +; CHECK-DAG: %[[#VAL:]] = OpConstantNull %[[#TYPE]] +; CHECK-DAG: %[[#VTYPE:]] = OpTypePointer CrossWorkgroup %[[#TYPE]] +; CHECK-DAG: %[[#PTR:]] = OpVariable %[[#VTYPE]] CrossWorkgroup %[[#VAL]] @Ptr = addrspace(1) global ptr addrspace(1) null -@Init = private addrspace(2) constant i32 123 -; CHECK-DAG: %[[#PTR:]] = OpVariable %[[#]] UniformConstant %[[#]] -; CHECK-DAG: %[[#INIT:]] = OpVariable %[[#]] CrossWorkgroup %[[#]] +; CHECK-DAG: %[[#VAL:]] = OpConstant %[[#U32]] 123 +; CHECK-DAG: %[[#VTYPE:]] = OpTypePointer UniformConstant %[[#U32]] +; CHECK-DAG: %[[#INIT:]] = OpVariable %[[#VTYPE]] UniformConstant %[[#VAL]] +@Init = private addrspace(2) constant i32 123 -; CHECK: %[[#]] = OpLoad %[[#]] %[[#INIT]] Aligned 8 -; CHECK: OpCopyMemorySized %[[#]] %[[#PTR]] %[[#]] Aligned 4 +; CHECK-DAG: %[[#VAL:]] = OpConstant %[[#U32]] 456 +; CHECK-DAG: %[[#VTYPE:]] = OpTypePointer Private %[[#U32]] +; CHECK-DAG: %[[#]] = OpVariable %[[#VTYPE]] Private %[[#VAL]] +@PrivInternal = internal addrspace(10) global i32 456 define spir_kernel void @Foo() { + ; CHECK: %[[#]] = OpLoad %[[#]] %[[#PTR]] Aligned 8 %l = load ptr addrspace(1), ptr addrspace(1) @Ptr, align 8 + ; CHECK: OpCopyMemorySized %[[#]] %[[#INIT]] %[[#]] Aligned 4 call void @llvm.memcpy.p1.p2.i64(ptr addrspace(1) align 4 %l, ptr addrspace(2) align 1 @Init, i64 4, i1 false) ret void } diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll index c98fef3631e04..ee5596ed38b1b 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpPhi_ArgumentsPlaceholders.ll @@ -12,7 +12,8 @@ ;; } ;; } -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; XFAIL: * %struct.Node = type { %struct.Node.0 addrspace(1)* } @@ -25,8 +26,8 @@ entry: for.cond: ; preds = %for.inc, %entry %pNode.0 = phi %struct.Node addrspace(1)* [ %pNodes, %entry ], [ %1, %for.inc ] %j.0 = phi 
i32 [ 0, %entry ], [ %inc, %for.inc ]
-; CHECK-SPIRV: %[[#]] = OpPhi %[[#]] %[[#]] %[[#]] %[[#BitcastResultId:]] %[[#]]
-; CHECK-SPIRV-NEXT: OpPhi
+; CHECK: %[[#]] = OpPhi %[[#]] %[[#]] %[[#]] %[[#BitcastResultId:]] %[[#]]
+; CHECK-NEXT: OpPhi
   %cmp = icmp slt i32 %j.0, 10
   br i1 %cmp, label %for.body, label %for.end
@@ -36,8 +37,8 @@ for.body: ; preds = %for.cond
   %0 = load %struct.Node.0 addrspace(1)*, %struct.Node.0 addrspace(1)* addrspace(1)* %pNext, align 4
   %1 = bitcast %struct.Node.0 addrspace(1)* %0 to %struct.Node addrspace(1)*
-; CHECK-SPIRV: %[[#LoadResultId:]] = OpLoad %[[#]]
-; CHECK-SPIRV: %[[#BitcastResultId]] = OpBitcast %[[#]] %[[#LoadResultId]]
+; CHECK: %[[#LoadResultId:]] = OpLoad %[[#]]
+; CHECK: %[[#BitcastResultId]] = OpBitcast %[[#]] %[[#LoadResultId]]
   br label %for.inc
diff --git a/llvm/test/CodeGen/SPIRV/validate/sycl-tangle-group-algorithms.ll b/llvm/test/CodeGen/SPIRV/validate/sycl-tangle-group-algorithms.ll
new file mode 100644
index 0000000000000..b6b919f36d92c
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/validate/sycl-tangle-group-algorithms.ll
@@ -0,0 +1,4673 @@
+; This is an excerpt from the SYCL end-to-end test suite, stripped of
+; irrelevant details, that reproduces cases of invalid SPIR-V generation due
+; to wrong types being deduced from the input LLVM IR. Namely, this test case
+; covers type mismatches that arise when a null pointer constant is used in
+; different contexts, and so with different pointee types, as well as
+; intertwined load/store/function-call LLVM IR input with bitcasts inserted
+; between instruction uses.
+
+; The only pass criterion is that spirv-val considers the output valid.
+
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64v1.5-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+%"nd_item" = type { i8 }
+%struct.AssertHappened = type { i32, [257 x i8], [257 x i8], [129 x i8], i32, i64, i64, i64, i64, i64, i64 }
+%"range" = type { %"detail::array" }
+%"detail::array" = type { [1 x i64] }
+%class.anon = type { %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor" }
+%"accessor" = type { %"detail::AccessorImplDevice", %union.anon }
+%"detail::AccessorImplDevice" = type { %"range", %"range", %"range" }
+%union.anon = type { ptr addrspace(1) }
+%class.anon.6 = type { ptr addrspace(4), ptr addrspace(4), ptr addrspace(4), ptr addrspace(4) }
+%"group" = type { %"range", %"range", %"range", %"range" }
+%"item" = type { %"detail::AccessorImplDevice" }
+%"item.22" = type { %"sd_ItemBase.23" }
+%"sd_ItemBase.23" = type { %"range", %"range" }
+%"tangle_group" = type { %"ss_sub_group_mask" }
+%"ss_sub_group_mask" = type { i64, i64 }
+%class.anon.8 = type { %"accessor", %"accessor", [8 x i8], %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor", %"accessor" }
+%"vec.16" = type { %"struct.std::array.20" }
+%"struct.std::array.20" = type { [4 x i32] }
+%class.anon.15 = type { ptr addrspace(4), ptr addrspace(4), ptr addrspace(4) }
+%class.anon.7 = type { ptr addrspace(4), ptr addrspace(4) }
+
+@.str = private unnamed_addr addrspace(1) constant [21 x i8] c"bits_num <= max_bits\00", align 1
+@.str.1 = private unnamed_addr addrspace(1) constant [17 x i8] c"subgroupmask.hpp\00", align 1
+@__PRETTY_FUNCTION1 = private unnamed_addr addrspace(1) constant [32 x i8] c"subgroup_mask(BitsType, size_t)\00", align 1
+@.str.2 = private unnamed_addr addrspace(1) constant [15 x i8] c"bn
<= max_bits\00", align 1 +@__PRETTY_FUNCTION2 = private unnamed_addr addrspace(1) constant [52 x i8] c"BitsType subgroup_mask::valuable_bits(size_t) const\00", align 1 +@__spirv_BuiltInSubgroupMaxSize = external dso_local addrspace(1) constant i32, align 4 +@__spirv_BuiltInSubgroupLocalInvocationId = external dso_local addrspace(1) constant i32, align 4 +@_ZSt6ignore = linkonce_odr dso_local addrspace(1) constant %"nd_item" undef, align 1 +@__spirv_BuiltInNumWorkgroups = external dso_local addrspace(1) constant <3 x i64>, align 32 +@__spirv_BuiltInGlobalOffset = external dso_local addrspace(1) constant <3 x i64>, align 32 +@__spirv_BuiltInGlobalInvocationId = external dso_local addrspace(1) constant <3 x i64>, align 32 +@__spirv_BuiltInGlobalSize = external dso_local addrspace(1) constant <3 x i64>, align 32 +@__spirv_BuiltInLocalInvocationId = external dso_local addrspace(1) constant <3 x i64>, align 32 +@SPIR_AssertHappenedMem = linkonce_odr dso_local addrspace(1) global %struct.AssertHappened zeroinitializer +@__spirv_BuiltInWorkgroupId = external dso_local addrspace(1) constant <3 x i64>, align 32 +@__spirv_BuiltInWorkgroupSize = external dso_local addrspace(1) constant <3 x i64>, align 32 + + +define weak_odr dso_local spir_kernel void @TestKernel(ptr addrspace(1) %_arg_TmpAcc, ptr byval(%"range") %_arg_TmpAcc1, ptr byval(%"range") %_arg_TmpAcc2, ptr byval(%"range") %_arg_TmpAcc3, ptr addrspace(1) align 1 %_arg_BarrierAcc, ptr byval(%"range") %_arg_BarrierAcc4, ptr byval(%"range") %_arg_BarrierAcc5, ptr byval(%"range") %_arg_BarrierAcc6, ptr addrspace(1) align 1 %_arg_BroadcastAcc, ptr byval(%"range") %_arg_BroadcastAcc7, ptr byval(%"range") %_arg_BroadcastAcc8, ptr byval(%"range") %_arg_BroadcastAcc9, ptr addrspace(1) align 1 %_arg_AnyAcc, ptr byval(%"range") %_arg_AnyAcc10, ptr byval(%"range") %_arg_AnyAcc11, ptr byval(%"range") %_arg_AnyAcc12, ptr addrspace(1) align 1 %_arg_AllAcc, ptr byval(%"range") %_arg_AllAcc13, ptr byval(%"range") %_arg_AllAcc14, ptr byval(%"range") %_arg_AllAcc15, ptr addrspace(1) align 1 %_arg_NoneAcc, ptr byval(%"range") %_arg_NoneAcc16, ptr byval(%"range") %_arg_NoneAcc17, ptr byval(%"range") %_arg_NoneAcc18, ptr addrspace(1) align 1 %_arg_ReduceAcc, ptr byval(%"range") %_arg_ReduceAcc19, ptr byval(%"range") %_arg_ReduceAcc20, ptr byval(%"range") %_arg_ReduceAcc21, ptr addrspace(1) align 1 %_arg_ExScanAcc, ptr byval(%"range") %_arg_ExScanAcc22, ptr byval(%"range") %_arg_ExScanAcc23, ptr byval(%"range") %_arg_ExScanAcc24, ptr addrspace(1) align 1 %_arg_IncScanAcc, ptr byval(%"range") %_arg_IncScanAcc25, ptr byval(%"range") %_arg_IncScanAcc26, ptr byval(%"range") %_arg_IncScanAcc27, ptr addrspace(1) align 1 %_arg_ShiftLeftAcc, ptr byval(%"range") %_arg_ShiftLeftAcc28, ptr byval(%"range") %_arg_ShiftLeftAcc29, ptr byval(%"range") %_arg_ShiftLeftAcc30, ptr addrspace(1) align 1 %_arg_ShiftRightAcc, ptr byval(%"range") %_arg_ShiftRightAcc31, ptr byval(%"range") %_arg_ShiftRightAcc32, ptr byval(%"range") %_arg_ShiftRightAcc33, ptr addrspace(1) align 1 %_arg_SelectAcc, ptr byval(%"range") %_arg_SelectAcc34, ptr byval(%"range") %_arg_SelectAcc35, ptr byval(%"range") %_arg_SelectAcc36, ptr addrspace(1) align 1 %_arg_PermuteXorAcc, ptr byval(%"range") %_arg_PermuteXorAcc37, ptr byval(%"range") %_arg_PermuteXorAcc38, ptr byval(%"range") %_arg_PermuteXorAcc39) { +entry: + %_arg_TmpAcc.addr = alloca ptr addrspace(1) + %_arg_BarrierAcc.addr = alloca ptr addrspace(1) + %_arg_BroadcastAcc.addr = alloca ptr addrspace(1) + %_arg_AnyAcc.addr = alloca ptr addrspace(1) + 
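; The remaining accessor pointer arguments are spilled to private allocas in the same way and later reloaded and forwarded to the accessor-initialization helpers (@Foo3/@Foo4). +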
%_arg_AllAcc.addr = alloca ptr addrspace(1) + %_arg_NoneAcc.addr = alloca ptr addrspace(1) + %_arg_ReduceAcc.addr = alloca ptr addrspace(1) + %_arg_ExScanAcc.addr = alloca ptr addrspace(1) + %_arg_IncScanAcc.addr = alloca ptr addrspace(1) + %_arg_ShiftLeftAcc.addr = alloca ptr addrspace(1) + %_arg_ShiftRightAcc.addr = alloca ptr addrspace(1) + %_arg_SelectAcc.addr = alloca ptr addrspace(1) + %_arg_PermuteXorAcc.addr = alloca ptr addrspace(1) + %Kernel = alloca %class.anon + %agg.tmp = alloca %"range" + %agg.tmp41 = alloca %"range" + %agg.tmp42 = alloca %"range" + %agg.tmp44 = alloca %"range" + %agg.tmp45 = alloca %"range" + %agg.tmp46 = alloca %"range" + %agg.tmp48 = alloca %"range" + %agg.tmp49 = alloca %"range" + %agg.tmp50 = alloca %"range" + %agg.tmp52 = alloca %"range" + %agg.tmp53 = alloca %"range" + %agg.tmp54 = alloca %"range" + %agg.tmp56 = alloca %"range" + %agg.tmp57 = alloca %"range" + %agg.tmp58 = alloca %"range" + %agg.tmp60 = alloca %"range" + %agg.tmp61 = alloca %"range" + %agg.tmp62 = alloca %"range" + %agg.tmp64 = alloca %"range" + %agg.tmp65 = alloca %"range" + %agg.tmp66 = alloca %"range" + %agg.tmp68 = alloca %"range" + %agg.tmp69 = alloca %"range" + %agg.tmp70 = alloca %"range" + %agg.tmp72 = alloca %"range" + %agg.tmp73 = alloca %"range" + %agg.tmp74 = alloca %"range" + %agg.tmp76 = alloca %"range" + %agg.tmp77 = alloca %"range" + %agg.tmp78 = alloca %"range" + %agg.tmp80 = alloca %"range" + %agg.tmp81 = alloca %"range" + %agg.tmp82 = alloca %"range" + %agg.tmp84 = alloca %"range" + %agg.tmp85 = alloca %"range" + %agg.tmp86 = alloca %"range" + %agg.tmp88 = alloca %"range" + %agg.tmp89 = alloca %"range" + %agg.tmp90 = alloca %"range" + %agg.tmp91 = alloca %"nd_item", align 1 + %Kernel.ascast = addrspacecast ptr %Kernel to ptr addrspace(4) + %agg.tmp91.ascast = addrspacecast ptr %agg.tmp91 to ptr addrspace(4) + store ptr addrspace(1) %_arg_TmpAcc, ptr %_arg_TmpAcc.addr + store ptr addrspace(1) %_arg_BarrierAcc, ptr %_arg_BarrierAcc.addr + store ptr addrspace(1) %_arg_BroadcastAcc, ptr %_arg_BroadcastAcc.addr + store ptr addrspace(1) %_arg_AnyAcc, ptr %_arg_AnyAcc.addr + store ptr addrspace(1) %_arg_AllAcc, ptr %_arg_AllAcc.addr + store ptr addrspace(1) %_arg_NoneAcc, ptr %_arg_NoneAcc.addr + store ptr addrspace(1) %_arg_ReduceAcc, ptr %_arg_ReduceAcc.addr + store ptr addrspace(1) %_arg_ExScanAcc, ptr %_arg_ExScanAcc.addr + store ptr addrspace(1) %_arg_IncScanAcc, ptr %_arg_IncScanAcc.addr + store ptr addrspace(1) %_arg_ShiftLeftAcc, ptr %_arg_ShiftLeftAcc.addr + store ptr addrspace(1) %_arg_ShiftRightAcc, ptr %_arg_ShiftRightAcc.addr + store ptr addrspace(1) %_arg_SelectAcc, ptr %_arg_SelectAcc.addr + store ptr addrspace(1) %_arg_PermuteXorAcc, ptr %_arg_PermuteXorAcc.addr + %TmpAcc1 = bitcast ptr addrspace(4) %Kernel.ascast to ptr addrspace(4) + call spir_func void @Foo1(ptr addrspace(4) %TmpAcc1) + %BarrierAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 1 + call spir_func void @Foo2(ptr addrspace(4) %BarrierAcc) + %BroadcastAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 2 + call spir_func void @Foo2(ptr addrspace(4) %BroadcastAcc) + %AnyAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 3 + call spir_func void @Foo2(ptr addrspace(4) %AnyAcc) + %AllAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 4 + call spir_func void @Foo2(ptr addrspace(4) %AllAcc) + %NoneAcc = getelementptr inbounds nuw %class.anon, 
ptr addrspace(4) %Kernel.ascast, i32 0, i32 5 + call spir_func void @Foo2(ptr addrspace(4) %NoneAcc) + %ReduceAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 6 + call spir_func void @Foo2(ptr addrspace(4) %ReduceAcc) + %ExScanAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 7 + call spir_func void @Foo2(ptr addrspace(4) %ExScanAcc) + %IncScanAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 8 + call spir_func void @Foo2(ptr addrspace(4) %IncScanAcc) + %ShiftLeftAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 9 + call spir_func void @Foo2(ptr addrspace(4) %ShiftLeftAcc) + %ShiftRightAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 10 + call spir_func void @Foo2(ptr addrspace(4) %ShiftRightAcc) + %SelectAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 11 + call spir_func void @Foo2(ptr addrspace(4) %SelectAcc) + %PermuteXorAcc = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 12 + call spir_func void @Foo2(ptr addrspace(4) %PermuteXorAcc) + %TmpAcc402 = bitcast ptr addrspace(4) %Kernel.ascast to ptr addrspace(4) + %0 = load ptr addrspace(1), ptr %_arg_TmpAcc.addr + call spir_func void @Foo3(ptr addrspace(4) %TmpAcc402, ptr addrspace(1) %0, ptr byval(%"range") %agg.tmp, ptr byval(%"range") %agg.tmp41, ptr byval(%"range") %agg.tmp42) + %BarrierAcc43 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 1 + %1 = load ptr addrspace(1), ptr %_arg_BarrierAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %BarrierAcc43, ptr addrspace(1) %1, ptr byval(%"range") %agg.tmp44, ptr byval(%"range") %agg.tmp45, ptr byval(%"range") %agg.tmp46) + %BroadcastAcc47 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 2 + %2 = load ptr addrspace(1), ptr %_arg_BroadcastAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %BroadcastAcc47, ptr addrspace(1) %2, ptr byval(%"range") %agg.tmp48, ptr byval(%"range") %agg.tmp49, ptr byval(%"range") %agg.tmp50) + %AnyAcc51 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 3 + %3 = load ptr addrspace(1), ptr %_arg_AnyAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %AnyAcc51, ptr addrspace(1) %3, ptr byval(%"range") %agg.tmp52, ptr byval(%"range") %agg.tmp53, ptr byval(%"range") %agg.tmp54) + %AllAcc55 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 4 + %4 = load ptr addrspace(1), ptr %_arg_AllAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %AllAcc55, ptr addrspace(1) %4, ptr byval(%"range") %agg.tmp56, ptr byval(%"range") %agg.tmp57, ptr byval(%"range") %agg.tmp58) + %NoneAcc59 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 5 + %5 = load ptr addrspace(1), ptr %_arg_NoneAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %NoneAcc59, ptr addrspace(1) %5, ptr byval(%"range") %agg.tmp60, ptr byval(%"range") %agg.tmp61, ptr byval(%"range") %agg.tmp62) + %ReduceAcc63 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 6 + %6 = load ptr addrspace(1), ptr %_arg_ReduceAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %ReduceAcc63, ptr addrspace(1) %6, ptr byval(%"range") %agg.tmp64, ptr byval(%"range") %agg.tmp65, ptr byval(%"range") %agg.tmp66) + %ExScanAcc67 = 
getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 7 + %7 = load ptr addrspace(1), ptr %_arg_ExScanAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %ExScanAcc67, ptr addrspace(1) %7, ptr byval(%"range") %agg.tmp68, ptr byval(%"range") %agg.tmp69, ptr byval(%"range") %agg.tmp70) + %IncScanAcc71 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 8 + %8 = load ptr addrspace(1), ptr %_arg_IncScanAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %IncScanAcc71, ptr addrspace(1) %8, ptr byval(%"range") %agg.tmp72, ptr byval(%"range") %agg.tmp73, ptr byval(%"range") %agg.tmp74) + %ShiftLeftAcc75 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 9 + %9 = load ptr addrspace(1), ptr %_arg_ShiftLeftAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %ShiftLeftAcc75, ptr addrspace(1) %9, ptr byval(%"range") %agg.tmp76, ptr byval(%"range") %agg.tmp77, ptr byval(%"range") %agg.tmp78) + %ShiftRightAcc79 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 10 + %10 = load ptr addrspace(1), ptr %_arg_ShiftRightAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %ShiftRightAcc79, ptr addrspace(1) %10, ptr byval(%"range") %agg.tmp80, ptr byval(%"range") %agg.tmp81, ptr byval(%"range") %agg.tmp82) + %SelectAcc83 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 11 + %11 = load ptr addrspace(1), ptr %_arg_SelectAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %SelectAcc83, ptr addrspace(1) %11, ptr byval(%"range") %agg.tmp84, ptr byval(%"range") %agg.tmp85, ptr byval(%"range") %agg.tmp86) + %PermuteXorAcc87 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %Kernel.ascast, i32 0, i32 12 + %12 = load ptr addrspace(1), ptr %_arg_PermuteXorAcc.addr + call spir_func void @Foo4(ptr addrspace(4) %PermuteXorAcc87, ptr addrspace(1) %12, ptr byval(%"range") %agg.tmp88, ptr byval(%"range") %agg.tmp89, ptr byval(%"range") %agg.tmp90) + %call = call spir_func ptr addrspace(4) @Foo5() + call spir_func void @Foo6(ptr addrspace(4) dead_on_unwind writable sret(%"nd_item") align 1 %agg.tmp91.ascast, ptr addrspace(4) %call) + call spir_func void @Foo22(ptr addrspace(4) %Kernel.ascast, ptr byval(%"nd_item") align 1 %agg.tmp91) + ret void +} + +define internal spir_func void @Foo1(ptr addrspace(4) %this) { +entry: + %this.addr = alloca ptr addrspace(4) + %agg.tmp = alloca %"range" + %agg.tmp2 = alloca %"range" + %agg.tmp3 = alloca %"range" + %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4) + %agg.tmp2.ascast = addrspacecast ptr %agg.tmp2 to ptr addrspace(4) + %agg.tmp3.ascast = addrspacecast ptr %agg.tmp3 to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + call void @llvm.memset.p0.i64(ptr %agg.tmp, i8 0, i64 8, i1 false) + call spir_func void @Foo11(ptr addrspace(4) %agg.tmp.ascast) + call spir_func void @Foo12(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp2.ascast) + call spir_func void @Foo12(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp3.ascast) + call spir_func void @Foo10(ptr addrspace(4) %impl1, ptr byval(%"range") %agg.tmp, ptr byval(%"range") %agg.tmp2, ptr byval(%"range") %agg.tmp3) + ret void +} + + +define internal spir_func void @Foo2(ptr addrspace(4) %this) { +entry: + %this.addr = alloca ptr addrspace(4) + %agg.tmp = alloca %"range" + 
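; @Foo2 mirrors @Foo1: it prepares three %"range" temporaries (one zeroed via llvm.memset and initialized by @Foo11, two produced by @Foo12) and passes them to @Foo10. +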
%agg.tmp2 = alloca %"range" + %agg.tmp3 = alloca %"range" + %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4) + %agg.tmp2.ascast = addrspacecast ptr %agg.tmp2 to ptr addrspace(4) + %agg.tmp3.ascast = addrspacecast ptr %agg.tmp3 to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + call void @llvm.memset.p0.i64(ptr %agg.tmp, i8 0, i64 8, i1 false) + call spir_func void @Foo11(ptr addrspace(4) %agg.tmp.ascast) + call spir_func void @Foo12(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp2.ascast) + call spir_func void @Foo12(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp3.ascast) + call spir_func void @Foo10(ptr addrspace(4) %impl1, ptr byval(%"range") %agg.tmp, ptr byval(%"range") %agg.tmp2, ptr byval(%"range") %agg.tmp3) + ret void +} + + + + +define internal spir_func void @Foo3(ptr addrspace(4) %this, ptr addrspace(1) %Ptr, ptr byval(%"range") %AccessRange, ptr byval(%"range") %MemRange, ptr byval(%"range") %Offset) { +entry: + %this.addr = alloca ptr addrspace(4) + %Ptr.addr = alloca ptr addrspace(1) + %ref.tmp = alloca %class.anon.6 + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + store ptr addrspace(1) %Ptr, ptr %Ptr.addr + %AccessRange.ascast = addrspacecast ptr %AccessRange to ptr addrspace(4) + %MemRange.ascast = addrspacecast ptr %MemRange to ptr addrspace(4) + %Offset.ascast = addrspacecast ptr %Offset to ptr addrspace(4) + %this1 = load ptr addrspace(4), ptr %this.addr + %0 = load ptr addrspace(1), ptr %Ptr.addr + %1 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) %this1, i32 0, i32 1 + store ptr addrspace(1) %0, ptr addrspace(4) %1 + %2 = bitcast ptr %ref.tmp to ptr + store ptr addrspace(4) %this1, ptr %2 + %Offset2 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 1 + store ptr addrspace(4) %Offset.ascast, ptr %Offset2 + %AccessRange3 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 2 + store ptr addrspace(4) %AccessRange.ascast, ptr %AccessRange3 + %MemRange4 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 3 + store ptr addrspace(4) %MemRange.ascast, ptr %MemRange4 + call spir_func void @Foo13(ptr addrspace(4) %ref.tmp.ascast) + %call = call spir_func i64 @Foo21(ptr addrspace(4) %this1) + %3 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) %this1, i32 0, i32 1 + %4 = load ptr addrspace(1), ptr addrspace(4) %3 + %add.ptr = getelementptr inbounds nuw i64, ptr addrspace(1) %4, i64 %call + store ptr addrspace(1) %add.ptr, ptr addrspace(4) %3 + ret void +} + + +define internal spir_func void @Foo4(ptr addrspace(4) %this, ptr addrspace(1) %Ptr, ptr byval(%"range") %AccessRange, ptr byval(%"range") %MemRange, ptr byval(%"range") %Offset) { +entry: + %this.addr = alloca ptr addrspace(4) + %Ptr.addr = alloca ptr addrspace(1) + %ref.tmp = alloca %class.anon.6 + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + store ptr addrspace(1) %Ptr, ptr %Ptr.addr + %AccessRange.ascast = addrspacecast ptr %AccessRange to ptr addrspace(4) + %MemRange.ascast = addrspacecast ptr %MemRange to ptr addrspace(4) + %Offset.ascast = addrspacecast ptr %Offset to ptr addrspace(4) + %this1 = load ptr addrspace(4), ptr %this.addr + %0 = load ptr addrspace(1), ptr %Ptr.addr + %1 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) 
%this1, i32 0, i32 1 + store ptr addrspace(1) %0, ptr addrspace(4) %1 + %2 = bitcast ptr %ref.tmp to ptr + store ptr addrspace(4) %this1, ptr %2 + %Offset2 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 1 + store ptr addrspace(4) %Offset.ascast, ptr %Offset2 + %AccessRange3 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 2 + store ptr addrspace(4) %AccessRange.ascast, ptr %AccessRange3 + %MemRange4 = getelementptr inbounds %class.anon.6, ptr %ref.tmp, i32 0, i32 3 + store ptr addrspace(4) %MemRange.ascast, ptr %MemRange4 + call spir_func void @Foo30(ptr addrspace(4) %ref.tmp.ascast) + %call = call spir_func i64 @Foo32(ptr addrspace(4) %this1) + %3 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) %this1, i32 0, i32 1 + %4 = load ptr addrspace(1), ptr addrspace(4) %3 + %add.ptr = getelementptr inbounds nuw i8, ptr addrspace(1) %4, i64 %call + store ptr addrspace(1) %add.ptr, ptr addrspace(4) %3 + ret void +} + + +define internal spir_func ptr addrspace(4) @Foo5() { +entry: + %retval = alloca ptr addrspace(4) + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + ret ptr addrspace(4) null +} + + +define internal spir_func void @Foo6(ptr addrspace(4) dead_on_unwind noalias writable sret(%"nd_item") align 1 %agg.result, ptr addrspace(4) %0) { +entry: + %.addr = alloca ptr addrspace(4) + %GlobalSize = alloca %"range" + %LocalSize = alloca %"range" + %GroupRange = alloca %"range" + %GroupId = alloca %"range" + %GlobalId = alloca %"range" + %LocalId = alloca %"range" + %GlobalOffset = alloca %"range" + %Group = alloca %"group" + %GlobalItem = alloca %"item" + %LocalItem = alloca %"item.22" + %cleanup.dest.slot = alloca i32, align 4 + %GlobalSize.ascast = addrspacecast ptr %GlobalSize to ptr addrspace(4) + %LocalSize.ascast = addrspacecast ptr %LocalSize to ptr addrspace(4) + %GroupRange.ascast = addrspacecast ptr %GroupRange to ptr addrspace(4) + %GroupId.ascast = addrspacecast ptr %GroupId to ptr addrspace(4) + %GlobalId.ascast = addrspacecast ptr %GlobalId to ptr addrspace(4) + %LocalId.ascast = addrspacecast ptr %LocalId to ptr addrspace(4) + %GlobalOffset.ascast = addrspacecast ptr %GlobalOffset to ptr addrspace(4) + %Group.ascast = addrspacecast ptr %Group to ptr addrspace(4) + %GlobalItem.ascast = addrspacecast ptr %GlobalItem to ptr addrspace(4) + %LocalItem.ascast = addrspacecast ptr %LocalItem to ptr addrspace(4) + store ptr addrspace(4) %0, ptr %.addr + call spir_func void @Foo7(ptr addrspace(4) dead_on_unwind writable sret(%"range") %GlobalSize.ascast) + call spir_func void @Init1(ptr addrspace(4) dead_on_unwind writable sret(%"range") %LocalSize.ascast) + call spir_func void @Init2(ptr addrspace(4) dead_on_unwind writable sret(%"range") %GroupRange.ascast) + call spir_func void @Init3(ptr addrspace(4) dead_on_unwind writable sret(%"range") %GroupId.ascast) + call spir_func void @Init6(ptr addrspace(4) dead_on_unwind writable sret(%"range") %GlobalId.ascast) + call spir_func void @Init4(ptr addrspace(4) dead_on_unwind writable sret(%"range") %LocalId.ascast) + call spir_func void @Init5(ptr addrspace(4) dead_on_unwind writable sret(%"range") %GlobalOffset.ascast) + call spir_func void @Foo23(ptr addrspace(4) dead_on_unwind writable sret(%"group") %Group.ascast, ptr addrspace(4) %GlobalSize.ascast, ptr addrspace(4) %LocalSize.ascast, ptr addrspace(4) %GroupRange.ascast, ptr addrspace(4) %GroupId.ascast) + call spir_func void @Foo24(ptr addrspace(4) dead_on_unwind writable sret(%"item") %GlobalItem.ascast, ptr addrspace(4) 
%GlobalSize.ascast, ptr addrspace(4) %GlobalId.ascast, ptr addrspace(4) %GlobalOffset.ascast) + call spir_func void @Foo25(ptr addrspace(4) dead_on_unwind writable sret(%"item.22") %LocalItem.ascast, ptr addrspace(4) %LocalSize.ascast, ptr addrspace(4) %LocalId.ascast) + call spir_func void @Foo26(ptr addrspace(4) dead_on_unwind writable sret(%"nd_item") align 1 %agg.result, ptr addrspace(4) %GlobalItem.ascast, ptr addrspace(4) %LocalItem.ascast, ptr addrspace(4) %Group.ascast) + ret void +} + + +define internal spir_func void @Foo22(ptr addrspace(4) %this, ptr byval(%"nd_item") align 1 %item) { +entry: + %this.addr.i76 = alloca ptr addrspace(4) + %WI.addr.i = alloca i64 + %TangleLeader.addr.i = alloca i64 + %TangleSize.addr.i = alloca i64 + %agg.tmp.i = alloca %"range" + %agg.tmp2.i = alloca %"tangle_group" + %Visible.i = alloca i64 + %Other.i = alloca i64 + %agg.tmp5.i = alloca %"range" + %agg.tmp8.i = alloca %"range" + %OriginalLID.i = alloca i32, align 4 + %LID.i = alloca i32, align 4 + %BroadcastResult.i = alloca i32, align 4 + %agg.tmp12.i = alloca %"tangle_group" + %agg.tmp15.i = alloca %"range" + %AnyResult.i = alloca i8, align 1 + %agg.tmp18.i = alloca %"tangle_group" + %agg.tmp24.i = alloca %"range" + %AllResult.i = alloca i8, align 1 + %agg.tmp27.i = alloca %"tangle_group" + %agg.tmp35.i = alloca %"range" + %NoneResult.i = alloca i8, align 1 + %agg.tmp38.i = alloca %"tangle_group" + %agg.tmp46.i = alloca %"range" + %ReduceResult.i = alloca i32, align 4 + %agg.tmp49.i = alloca %"tangle_group" + %agg.tmp50.i = alloca %"nd_item", align 1 + %agg.tmp54.i = alloca %"range" + %ExScanResult.i = alloca i32, align 4 + %agg.tmp57.i = alloca %"tangle_group" + %agg.tmp58.i = alloca %"nd_item", align 1 + %agg.tmp61.i = alloca %"range" + %IncScanResult.i = alloca i32, align 4 + %agg.tmp64.i = alloca %"tangle_group" + %agg.tmp65.i = alloca %"nd_item", align 1 + %agg.tmp69.i = alloca %"range" + %ShiftLeftResult.i = alloca i32, align 4 + %agg.tmp72.i = alloca %"tangle_group" + %agg.tmp79.i = alloca %"range" + %ShiftRightResult.i = alloca i32, align 4 + %agg.tmp82.i = alloca %"tangle_group" + %agg.tmp88.i = alloca %"range" + %SelectResult.i = alloca i32, align 4 + %agg.tmp91.i = alloca %"tangle_group" + %agg.tmp92.i = alloca %"range" + %ref.tmp.i = alloca %"range" + %ref.tmp93.i = alloca %"range" + %ref.tmp94.i = alloca i32, align 4 + %agg.tmp100.i = alloca %"range" + %PermuteXorResult.i = alloca i32, align 4 + %agg.tmp103.i = alloca %"tangle_group" + %agg.tmp106.i = alloca %"range" + %agg.tmp18.ascast.ascast75 = alloca %"nd_item" + %agg.tmp17.ascast.ascast74 = alloca %"tangle_group" + %retval.i66 = alloca i64 + %this.addr.i67 = alloca ptr addrspace(4) + %Result.i68 = alloca i64 + %retval.i58 = alloca i64 + %this.addr.i59 = alloca ptr addrspace(4) + %Result.i60 = alloca i64 + %retval.i50 = alloca i64 + %this.addr.i51 = alloca ptr addrspace(4) + %Result.i52 = alloca i64 + %retval.i42 = alloca i64 + %this.addr.i43 = alloca ptr addrspace(4) + %Result.i44 = alloca i64 + %retval.i = alloca i64 + %this.addr.i = alloca ptr addrspace(4) + %Result.i = alloca i64 + %this.addr = alloca ptr addrspace(4) + %WI = alloca %"range" + %SG = alloca %"nd_item", align 1 + %BranchBody = alloca %class.anon.8 + %ref.tmp = alloca %"range" + %ref.tmp15 = alloca i32, align 4 + %Tangle = alloca %"tangle_group" + %agg.tmp = alloca %"nd_item", align 1 + %TangleLeader = alloca i64 + %TangleSize = alloca i64 + %IsMember = alloca %"nd_item", align 1 + %agg.tmp17 = alloca %"tangle_group" + %agg.tmp18 = alloca %"nd_item", align 1 + 
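; The temporaries below serve the two alternative branches (if.then23 and if.else32), each of which constructs its own tangle_group with a distinct leader and size. +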
%ref.tmp19 = alloca %"range" + %ref.tmp20 = alloca i32, align 4 + %Tangle24 = alloca %"tangle_group" + %agg.tmp25 = alloca %"nd_item", align 1 + %TangleLeader26 = alloca i64 + %TangleSize27 = alloca i64 + %IsMember28 = alloca %"nd_item", align 1 + %agg.tmp30 = alloca %"tangle_group" + %agg.tmp31 = alloca %"nd_item", align 1 + %Tangle33 = alloca %"tangle_group" + %agg.tmp34 = alloca %"nd_item", align 1 + %TangleLeader35 = alloca i64 + %TangleSize36 = alloca i64 + %IsMember37 = alloca %"nd_item", align 1 + %agg.tmp39 = alloca %"tangle_group" + %agg.tmp40 = alloca %"nd_item", align 1 + %WI.ascast = addrspacecast ptr %WI to ptr addrspace(4) + %SG.ascast = addrspacecast ptr %SG to ptr addrspace(4) + %BranchBody.ascast = addrspacecast ptr %BranchBody to ptr addrspace(4) + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + %ref.tmp15.ascast = addrspacecast ptr %ref.tmp15 to ptr addrspace(4) + %Tangle.ascast = addrspacecast ptr %Tangle to ptr addrspace(4) + %IsMember.ascast = addrspacecast ptr %IsMember to ptr addrspace(4) + %ref.tmp19.ascast = addrspacecast ptr %ref.tmp19 to ptr addrspace(4) + %ref.tmp20.ascast = addrspacecast ptr %ref.tmp20 to ptr addrspace(4) + %Tangle24.ascast = addrspacecast ptr %Tangle24 to ptr addrspace(4) + %IsMember28.ascast = addrspacecast ptr %IsMember28 to ptr addrspace(4) + %Tangle33.ascast = addrspacecast ptr %Tangle33 to ptr addrspace(4) + %IsMember37.ascast = addrspacecast ptr %IsMember37 to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %item.ascast = addrspacecast ptr %item to ptr addrspace(4) + %this1 = load ptr addrspace(4), ptr %this.addr + call spir_func void @Foo40(ptr addrspace(4) dead_on_unwind writable sret(%"range") %WI.ascast, ptr addrspace(4) align 1 %item.ascast) + call spir_func void @Foo41(ptr addrspace(4) dead_on_unwind writable sret(%"nd_item") align 1 %SG.ascast, ptr addrspace(4) align 1 %item.ascast) + %TmpAcc1 = bitcast ptr %BranchBody to ptr + %TmpAcc22 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %BarrierAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 1 + %BarrierAcc3 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 1 + %0 = getelementptr inbounds i8, ptr addrspace(4) %BranchBody.ascast, i64 64 + %BroadcastAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 3 + %BroadcastAcc4 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 2 + %AnyAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 4 + %AnyAcc5 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 3 + %AllAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 5 + %AllAcc6 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 4 + %NoneAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 6 + %NoneAcc7 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 5 + %ReduceAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 7 + %ReduceAcc8 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 6 + %ExScanAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 8 + %ExScanAcc9 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 7 + %IncScanAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 9 + %IncScanAcc10 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 8 + %ShiftLeftAcc = getelementptr inbounds 
%class.anon.8, ptr %BranchBody, i32 0, i32 10 + %ShiftLeftAcc11 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 9 + %ShiftRightAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 11 + %ShiftRightAcc12 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 10 + %SelectAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 12 + %SelectAcc13 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 11 + %PermuteXorAcc = getelementptr inbounds %class.anon.8, ptr %BranchBody, i32 0, i32 13 + %PermuteXorAcc14 = getelementptr inbounds nuw %class.anon, ptr addrspace(4) %this1, i32 0, i32 12 + store i32 4, ptr %ref.tmp15, align 4 + call spir_func void @Foo42(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %WI.ascast, ptr addrspace(4) align 4 %ref.tmp15.ascast) + %retval.ascast.i69 = addrspacecast ptr %retval.i66 to ptr addrspace(4) + store ptr addrspace(4) %ref.tmp.ascast, ptr %this.addr.i67 + %this1.i72 = load ptr addrspace(4), ptr %this.addr.i67 + %1 = load i64, ptr addrspace(4) %this1.i72 + store i64 %1, ptr %Result.i68 + %2 = load i64, ptr %Result.i68 + %tobool = icmp ne i64 %2, 0 + br i1 %tobool, label %if.then, label %if.else + +if.else: ; preds = %entry + store i32 24, ptr %ref.tmp20, align 4 + call spir_func void @Foo42(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp19.ascast, ptr addrspace(4) %WI.ascast, ptr addrspace(4) align 4 %ref.tmp20.ascast) + %retval.ascast.i53 = addrspacecast ptr %retval.i50 to ptr addrspace(4) + store ptr addrspace(4) %ref.tmp19.ascast, ptr %this.addr.i51 + %this1.i56 = load ptr addrspace(4), ptr %this.addr.i51 + %3 = load i64, ptr addrspace(4) %this1.i56 + store i64 %3, ptr %Result.i52 + %4 = load i64, ptr %Result.i52 + %tobool22 = icmp ne i64 %4, 0 + br i1 %tobool22, label %if.then23, label %if.else32 + +if.else32: ; preds = %if.else + call spir_func void @Foo43(ptr addrspace(4) dead_on_unwind writable sret(%"tangle_group") %Tangle33.ascast, ptr byval(%"nd_item") align 1 %agg.tmp34) + store i64 24, ptr %TangleLeader35 + store i64 8, ptr %TangleSize36 + %retval.ascast.i = addrspacecast ptr %retval.i to ptr addrspace(4) + store ptr addrspace(4) %WI.ascast, ptr %this.addr.i + %this1.i = load ptr addrspace(4), ptr %this.addr.i + %5 = load i64, ptr addrspace(4) %this1.i + store i64 %5, ptr %Result.i + %6 = load i64, ptr %Result.i + %7 = load i64, ptr %TangleLeader35 + %8 = load i64, ptr %TangleSize36 + call spir_func void @Foo69(ptr addrspace(4) %BranchBody.ascast, i64 %6, ptr byval(%"tangle_group") %agg.tmp39, i64 %7, i64 %8, ptr byval(%"nd_item") align 1 %agg.tmp40) + br label %if.end41 + +if.then23: ; preds = %if.else + call spir_func void @Foo43(ptr addrspace(4) dead_on_unwind writable sret(%"tangle_group") %Tangle24.ascast, ptr byval(%"nd_item") align 1 %agg.tmp25) + store i64 4, ptr %TangleLeader26 + store i64 20, ptr %TangleSize27 + %retval.ascast.i45 = addrspacecast ptr %retval.i42 to ptr addrspace(4) + store ptr addrspace(4) %WI.ascast, ptr %this.addr.i43 + %this1.i48 = load ptr addrspace(4), ptr %this.addr.i43 + %9 = load i64, ptr addrspace(4) %this1.i48 + store i64 %9, ptr %Result.i44 + %10 = load i64, ptr %Result.i44 + %11 = load i64, ptr %TangleLeader26 + %12 = load i64, ptr %TangleSize27 + call spir_func void @Foo68(ptr addrspace(4) %BranchBody.ascast, i64 %10, ptr byval(%"tangle_group") %agg.tmp30, i64 %11, i64 %12, ptr byval(%"nd_item") align 1 %agg.tmp31) + br label %if.end41 + 
+if.then: ; preds = %entry + call spir_func void @Foo43(ptr addrspace(4) dead_on_unwind writable sret(%"tangle_group") %Tangle.ascast, ptr byval(%"nd_item") align 1 %agg.tmp) + store i64 0, ptr %TangleLeader + store i64 4, ptr %TangleSize + %retval.ascast.i61 = addrspacecast ptr %retval.i58 to ptr addrspace(4) + store ptr addrspace(4) %WI.ascast, ptr %this.addr.i59 + %this1.i64 = load ptr addrspace(4), ptr %this.addr.i59 + %13 = load i64, ptr addrspace(4) %this1.i64 + store i64 %13, ptr %Result.i60 + %14 = load i64, ptr %Result.i60 + %15 = load i64, ptr %TangleLeader + %16 = load i64, ptr %TangleSize + %TangleSize.addr.ascast.i = addrspacecast ptr %TangleSize.addr.i to ptr addrspace(4) + %agg.tmp.ascast.i = addrspacecast ptr %agg.tmp.i to ptr addrspace(4) + %agg.tmp5.ascast.i = addrspacecast ptr %agg.tmp5.i to ptr addrspace(4) + %agg.tmp8.ascast.i = addrspacecast ptr %agg.tmp8.i to ptr addrspace(4) + %agg.tmp15.ascast.i = addrspacecast ptr %agg.tmp15.i to ptr addrspace(4) + %agg.tmp24.ascast.i = addrspacecast ptr %agg.tmp24.i to ptr addrspace(4) + %agg.tmp35.ascast.i = addrspacecast ptr %agg.tmp35.i to ptr addrspace(4) + %agg.tmp46.ascast.i = addrspacecast ptr %agg.tmp46.i to ptr addrspace(4) + %agg.tmp50.ascast.i = addrspacecast ptr %agg.tmp50.i to ptr addrspace(4) + %agg.tmp54.ascast.i = addrspacecast ptr %agg.tmp54.i to ptr addrspace(4) + %agg.tmp58.ascast.i = addrspacecast ptr %agg.tmp58.i to ptr addrspace(4) + %agg.tmp61.ascast.i = addrspacecast ptr %agg.tmp61.i to ptr addrspace(4) + %agg.tmp65.ascast.i = addrspacecast ptr %agg.tmp65.i to ptr addrspace(4) + %agg.tmp69.ascast.i = addrspacecast ptr %agg.tmp69.i to ptr addrspace(4) + %agg.tmp79.ascast.i = addrspacecast ptr %agg.tmp79.i to ptr addrspace(4) + %agg.tmp88.ascast.i = addrspacecast ptr %agg.tmp88.i to ptr addrspace(4) + %agg.tmp92.ascast.i = addrspacecast ptr %agg.tmp92.i to ptr addrspace(4) + %ref.tmp.ascast.i = addrspacecast ptr %ref.tmp.i to ptr addrspace(4) + %ref.tmp93.ascast.i = addrspacecast ptr %ref.tmp93.i to ptr addrspace(4) + %ref.tmp94.ascast.i = addrspacecast ptr %ref.tmp94.i to ptr addrspace(4) + %agg.tmp100.ascast.i = addrspacecast ptr %agg.tmp100.i to ptr addrspace(4) + %agg.tmp106.ascast.i = addrspacecast ptr %agg.tmp106.i to ptr addrspace(4) + store ptr addrspace(4) %BranchBody.ascast, ptr %this.addr.i76 + store i64 %14, ptr %WI.addr.i + %Tangle.ascast.i = addrspacecast ptr %agg.tmp17.ascast.ascast74 to ptr addrspace(4) + store i64 %15, ptr %TangleLeader.addr.i + store i64 %16, ptr %TangleSize.addr.i + %IsMember.ascast.i = addrspacecast ptr %agg.tmp18.ascast.ascast75 to ptr addrspace(4) + %this1.i78 = load ptr addrspace(4), ptr %this.addr.i76 + %17 = load i64, ptr %WI.addr.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp.ascast.i, i64 %17) + %call.i = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %this1.i78, ptr byval(%"range") %agg.tmp.i) + store i64 1, ptr addrspace(4) %call.i + call spir_func void @Foo75(ptr byval(%"tangle_group") %agg.tmp2.i, i32 1) + store i64 0, ptr %Visible.i + store i64 0, ptr %Other.i + br label %for.cond.i + +for.cond.i: ; preds = %if.end.i, %if.then + %18 = load i64, ptr %Other.i + %cmp.i79 = icmp ult i64 %18, 32 + br i1 %cmp.i79, label %for.body.i, label %for.cond.cleanup.i + +for.cond.cleanup.i: ; preds = %for.cond.i + %19 = load i64, ptr %Visible.i + %20 = load i64, ptr %TangleSize.addr.i + %cmp7.i = icmp eq i64 %19, %20 + %BarrierAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 1 + %21 = load i64, ptr %WI.addr.i + call 
spir_func void @Foo46(ptr addrspace(4) %agg.tmp8.ascast.i, i64 %21) + %call9.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BarrierAcc.i, ptr byval(%"range") %agg.tmp8.i) + %storedv.i = zext i1 %cmp7.i to i8 + store i8 %storedv.i, ptr addrspace(4) %call9.i, align 1 + %22 = getelementptr inbounds i8, ptr addrspace(4) %this1.i78, i64 64 + %call10.i = call spir_func i32 @Foo76(ptr addrspace(4) align 1 %22) + store i32 %call10.i, ptr %OriginalLID.i, align 4 + %call11.i = call spir_func i32 @Foo90(ptr addrspace(4) %Tangle.ascast.i) + store i32 %call11.i, ptr %LID.i, align 4 + %23 = load i32, ptr %OriginalLID.i, align 4 + %call13.i = call spir_func i32 @Foo91(ptr byval(%"tangle_group") %agg.tmp12.i, i32 %23, i32 0) + store i32 %call13.i, ptr %BroadcastResult.i, align 4 + %24 = load i32, ptr %BroadcastResult.i, align 4 + %conv.i = zext i32 %24 to i64 + %25 = load i64, ptr %TangleLeader.addr.i + %cmp14.i = icmp eq i64 %conv.i, %25 + %BroadcastAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 3 + %26 = load i64, ptr %WI.addr.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp15.ascast.i, i64 %26) + %call16.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BroadcastAcc.i, ptr byval(%"range") %agg.tmp15.i) + %storedv17.i = zext i1 %cmp14.i to i8 + store i8 %storedv17.i, ptr addrspace(4) %call16.i, align 1 + %27 = load i32, ptr %LID.i, align 4 + %cmp19.i = icmp eq i32 %27, 0 + %call20.i = call spir_func zeroext i1 @Foo92(ptr byval(%"tangle_group") %agg.tmp18.i, i1 zeroext %cmp19.i) + %storedv21.i = zext i1 %call20.i to i8 + store i8 %storedv21.i, ptr %AnyResult.i, align 1 + %28 = load i8, ptr %AnyResult.i, align 1 + %loadedv.i = trunc i8 %28 to i1 + %conv22.i = zext i1 %loadedv.i to i32 + %AnyAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 4 + %29 = load i64, ptr %WI.addr.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp24.ascast.i, i64 %29) + %call25.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AnyAcc.i, ptr byval(%"range") %agg.tmp24.i) + %storedv26.i = zext i1 %loadedv.i to i8 + store i8 %storedv26.i, ptr addrspace(4) %call25.i, align 1 + %30 = load i32, ptr %LID.i, align 4 + %conv28.i = zext i32 %30 to i64 + %31 = load i64, ptr %TangleSize.addr.i + %cmp29.i = icmp ult i64 %conv28.i, %31 + %call30.i = call spir_func zeroext i1 @Foo67(ptr byval(%"tangle_group") %agg.tmp27.i, i1 zeroext %cmp29.i) + %storedv31.i = zext i1 %call30.i to i8 + store i8 %storedv31.i, ptr %AllResult.i, align 1 + %32 = load i8, ptr %AllResult.i, align 1 + %loadedv32.i = trunc i8 %32 to i1 + %conv33.i = zext i1 %loadedv32.i to i32 + %AllAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 5 + %33 = load i64, ptr %WI.addr.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp35.ascast.i, i64 %33) + %call36.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AllAcc.i, ptr byval(%"range") %agg.tmp35.i) + %storedv37.i = zext i1 %loadedv32.i to i8 + store i8 %storedv37.i, ptr addrspace(4) %call36.i, align 1 + %34 = load i32, ptr %LID.i, align 4 + %conv39.i = zext i32 %34 to i64 + %35 = load i64, ptr %TangleSize.addr.i + %cmp40.i = icmp uge i64 %conv39.i, %35 + %call41.i = call spir_func zeroext i1 @Foo65(ptr byval(%"tangle_group") %agg.tmp38.i, i1 zeroext %cmp40.i) + %storedv42.i = zext i1 %call41.i to i8 + store i8 %storedv42.i, ptr %NoneResult.i, align 1 + %36 = load i8, ptr %NoneResult.i, align 1 + %loadedv43.i = 
trunc i8 %36 to i1 + %conv44.i = zext i1 %loadedv43.i to i32 + %NoneAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 6 + %37 = load i64, ptr %WI.addr.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp46.ascast.i, i64 %37) + %call47.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %NoneAcc.i, ptr byval(%"range") %agg.tmp46.i) + %storedv48.i = zext i1 %loadedv43.i to i8 + store i8 %storedv48.i, ptr addrspace(4) %call47.i, align 1 + %call51.i = call spir_func i32 @Foo64(ptr byval(%"tangle_group") %agg.tmp49.i, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp50.i) + store i32 %call51.i, ptr %ReduceResult.i, align 4 + %38 = load i32, ptr %ReduceResult.i, align 4 + %conv52.i = zext i32 %38 to i64 + %39 = load i64, ptr %TangleSize.addr.i + %cmp53.i = icmp eq i64 %conv52.i, %39 + %ReduceAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 7 + %40 = load i64, ptr %WI.addr.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp54.ascast.i, i64 %40) + %call55.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ReduceAcc.i, ptr byval(%"range") %agg.tmp54.i) + %storedv56.i = zext i1 %cmp53.i to i8 + store i8 %storedv56.i, ptr addrspace(4) %call55.i, align 1 + %call59.i = call spir_func i32 @Foo63(ptr byval(%"tangle_group") %agg.tmp57.i, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp58.i) + store i32 %call59.i, ptr %ExScanResult.i, align 4 + %41 = load i32, ptr %ExScanResult.i, align 4 + %42 = load i32, ptr %LID.i, align 4 + %cmp60.i = icmp eq i32 %41, %42 + %ExScanAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 8 + %43 = load i64, ptr %WI.addr.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp61.ascast.i, i64 %43) + %call62.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ExScanAcc.i, ptr byval(%"range") %agg.tmp61.i) + %storedv63.i = zext i1 %cmp60.i to i8 + store i8 %storedv63.i, ptr addrspace(4) %call62.i, align 1 + %call66.i = call spir_func i32 @Foo62(ptr byval(%"tangle_group") %agg.tmp64.i, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp65.i) + store i32 %call66.i, ptr %IncScanResult.i, align 4 + %44 = load i32, ptr %IncScanResult.i, align 4 + %45 = load i32, ptr %LID.i, align 4 + %add67.i = add i32 %45, 1 + %cmp68.i = icmp eq i32 %44, %add67.i + %IncScanAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 9 + %46 = load i64, ptr %WI.addr.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp69.ascast.i, i64 %46) + %call70.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %IncScanAcc.i, ptr byval(%"range") %agg.tmp69.i) + %storedv71.i = zext i1 %cmp68.i to i8 + store i8 %storedv71.i, ptr addrspace(4) %call70.i, align 1 + %47 = load i32, ptr %LID.i, align 4 + %call73.i = call spir_func i32 @Foo73(ptr byval(%"tangle_group") %agg.tmp72.i, i32 %47, i32 2) + store i32 %call73.i, ptr %ShiftLeftResult.i, align 4 + %48 = load i32, ptr %LID.i, align 4 + %add74.i = add i32 %48, 2 + %conv75.i = zext i32 %add74.i to i64 + %49 = load i64, ptr %TangleSize.addr.i + %cmp76.i = icmp uge i64 %conv75.i, %49 + br i1 %cmp76.i, label %lor.end.i, label %lor.rhs.i + +lor.rhs.i: ; preds = %for.cond.cleanup.i + %50 = load i32, ptr %ShiftLeftResult.i, align 4 + %51 = load i32, ptr %LID.i, align 4 + %add77.i = add i32 %51, 2 + %cmp78.i = icmp eq i32 %50, %add77.i + br label %lor.end.i + +lor.end.i: ; preds = %lor.rhs.i, %for.cond.cleanup.i + %52 = phi i1 [ true, %for.cond.cleanup.i ], [ 
%cmp78.i, %lor.rhs.i ] + %ShiftLeftAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 10 + %53 = load i64, ptr %WI.addr.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp79.ascast.i, i64 %53) + %call80.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ShiftLeftAcc.i, ptr byval(%"range") %agg.tmp79.i) + %storedv81.i = zext i1 %52 to i8 + store i8 %storedv81.i, ptr addrspace(4) %call80.i, align 1 + %54 = load i32, ptr %LID.i, align 4 + %call83.i = call spir_func i32 @Foo53(ptr byval(%"tangle_group") %agg.tmp82.i, i32 %54, i32 2) + store i32 %call83.i, ptr %ShiftRightResult.i, align 4 + %55 = load i32, ptr %LID.i, align 4 + %cmp84.i = icmp ult i32 %55, 2 + br i1 %cmp84.i, label %l1.exit, label %lor.rhs85.i + +lor.rhs85.i: ; preds = %lor.end.i + %56 = load i32, ptr %ShiftRightResult.i, align 4 + %57 = load i32, ptr %LID.i, align 4 + %sub.i = sub i32 %57, 2 + %cmp86.i = icmp eq i32 %56, %sub.i + br label %l1.exit + +l1.exit: ; preds = %lor.rhs85.i, %lor.end.i + %58 = phi i1 [ true, %lor.end.i ], [ %cmp86.i, %lor.rhs85.i ] + %ShiftRightAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 11 + %59 = load i64, ptr %WI.addr.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp88.ascast.i, i64 %59) + %call89.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ShiftRightAcc.i, ptr byval(%"range") %agg.tmp88.i) + %storedv90.i = zext i1 %58 to i8 + store i8 %storedv90.i, ptr addrspace(4) %call89.i, align 1 + %60 = load i32, ptr %LID.i, align 4 + call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp93.ascast.i, ptr addrspace(4) %Tangle.ascast.i) + store i32 2, ptr %ref.tmp94.i, align 4 + call spir_func void @Foo55(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast.i, ptr addrspace(4) %ref.tmp93.ascast.i, ptr addrspace(4) align 4 %ref.tmp94.ascast.i) + call spir_func void @Foo56(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp92.ascast.i, ptr addrspace(4) %ref.tmp.ascast.i, ptr addrspace(4) %TangleSize.addr.ascast.i) + %call95.i = call spir_func i32 @Foo57(ptr byval(%"tangle_group") %agg.tmp91.i, i32 %60, ptr byval(%"range") %agg.tmp92.i) + store i32 %call95.i, ptr %SelectResult.i, align 4 + %61 = load i32, ptr %SelectResult.i, align 4 + %conv96.i = zext i32 %61 to i64 + %62 = load i32, ptr %LID.i, align 4 + %add97.i = add i32 %62, 2 + %conv98.i = zext i32 %add97.i to i64 + %63 = load i64, ptr %TangleSize.addr.i + %rem.i = urem i64 %conv98.i, %63 + %cmp99.i = icmp eq i64 %conv96.i, %rem.i + %SelectAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 12 + %64 = load i64, ptr %WI.addr.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp100.ascast.i, i64 %64) + %call101.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %SelectAcc.i, ptr byval(%"range") %agg.tmp100.i) + %storedv102.i = zext i1 %cmp99.i to i8 + store i8 %storedv102.i, ptr addrspace(4) %call101.i, align 1 + %65 = load i32, ptr %LID.i, align 4 + %call104.i = call spir_func i32 @Foo58(ptr byval(%"tangle_group") %agg.tmp103.i, i32 %65, i32 2) + store i32 %call104.i, ptr %PermuteXorResult.i, align 4 + %66 = load i32, ptr %PermuteXorResult.i, align 4 + %67 = load i32, ptr %LID.i, align 4 + %xor.i = xor i32 %67, 2 + %cmp105.i = icmp eq i32 %66, %xor.i + %PermuteXorAcc.i = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1.i78, i32 0, i32 13 + %68 = load i64, ptr %WI.addr.i + call 
spir_func void @Foo46(ptr addrspace(4) %agg.tmp106.ascast.i, i64 %68) + %call107.i = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %PermuteXorAcc.i, ptr byval(%"range") %agg.tmp106.i) + %storedv108.i = zext i1 %cmp105.i to i8 + store i8 %storedv108.i, ptr addrspace(4) %call107.i, align 1 + br label %if.end41 + +if.end41: ; preds = %if.then23, %if.else32, %l1.exit + ret void + +for.body.i: ; preds = %for.cond.i + %69 = load i64, ptr %Other.i + %call3.i = call spir_func zeroext i1 @Foo71(ptr addrspace(4) align 1 %IsMember.ascast.i, i64 %69) + br i1 %call3.i, label %if.then.i, label %if.end.i + +if.then.i: ; preds = %for.body.i + %70 = load i64, ptr %Other.i + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp5.ascast.i, i64 %70) + %call6.i = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %this1.i78, ptr byval(%"range") %agg.tmp5.i) + %71 = load i64, ptr addrspace(4) %call6.i + %72 = load i64, ptr %Visible.i + %add.i = add i64 %72, %71 + store i64 %add.i, ptr %Visible.i + br label %if.end.i + +if.end.i: ; preds = %if.then.i, %for.body.i + %73 = load i64, ptr %Other.i + %inc.i = add i64 %73, 1 + store i64 %inc.i, ptr %Other.i + br label %for.cond.i +} + +define internal spir_func void @Foo40(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) align 1 %this) { +entry: + %this.addr = alloca ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + call spir_func void @Init6(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result) + ret void +} + +define internal spir_func void @Foo41(ptr addrspace(4) dead_on_unwind noalias writable sret(%"nd_item") align 1 %agg.result, ptr addrspace(4) align 1 %this) { +entry: + %this.addr = alloca ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + ret void +} + + + + +define internal spir_func void @Foo42(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %lhs, ptr addrspace(4) align 4 %rhs) { +entry: + %lhs.addr = alloca ptr addrspace(4) + %rhs.addr = alloca ptr addrspace(4) + %i = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + store ptr addrspace(4) %lhs, ptr %lhs.addr + store ptr addrspace(4) %rhs, ptr %rhs.addr + call spir_func void @Foo11(ptr addrspace(4) %agg.result) + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 1 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + ret void + +for.body: ; preds = %for.cond + %1 = load ptr addrspace(4), ptr %lhs.addr + %common_array1 = bitcast ptr addrspace(4) %1 to ptr addrspace(4) + %2 = load i32, ptr %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array1, i64 0, i64 %idxprom + %3 = load i64, ptr addrspace(4) %arrayidx + %4 = load ptr addrspace(4), ptr %rhs.addr + %5 = load i32, ptr addrspace(4) %4, align 4 + %conv = sext i32 %5 to i64 + %cmp1 = icmp ult i64 %3, %conv + %conv2 = zext i1 %cmp1 to i64 + %common_array32 = bitcast ptr addrspace(4) %agg.result to ptr addrspace(4) + %6 = load i32, ptr %i, align 4 + %idxprom4 = sext i32 %6 to i64 + %arrayidx5 = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array32, i64 0, i64 %idxprom4 + store i64 %conv2, ptr addrspace(4) %arrayidx5 + %7 = load i32, ptr %i, align 4 + %inc = add nsw i32 
%7, 1 + store i32 %inc, ptr %i, align 4 + br label %for.cond +} + +declare void @llvm.assume(i1) + + +define internal spir_func void @Foo43(ptr addrspace(4) dead_on_unwind noalias writable sret(%"tangle_group") %agg.result, ptr byval(%"nd_item") align 1 %group) { +entry: + %mask = alloca %"ss_sub_group_mask" + %agg.tmp = alloca %"nd_item", align 1 + %agg.tmp1 = alloca %"ss_sub_group_mask" + %cleanup.dest.slot = alloca i32, align 4 + %mask.ascast = addrspacecast ptr %mask to ptr addrspace(4) + %group.ascast = addrspacecast ptr %group to ptr addrspace(4) + call spir_func void @Foo44(ptr addrspace(4) dead_on_unwind writable sret(%"ss_sub_group_mask") %mask.ascast, ptr byval(%"nd_item") align 1 %agg.tmp, i1 zeroext true) + call spir_func void @Foo45(ptr addrspace(4) %agg.result, ptr byval(%"ss_sub_group_mask") %agg.tmp1) + ret void +} + + +define internal spir_func void @Foo46(ptr addrspace(4) %this, i64 %dim0) { +entry: + %this.addr = alloca ptr addrspace(4) + %dim0.addr = alloca i64 + store ptr addrspace(4) %this, ptr %this.addr + store i64 %dim0, ptr %dim0.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %0 = load i64, ptr %dim0.addr + call spir_func void @Foo60(ptr addrspace(4) %this1, i64 %0) + ret void +} + + +define internal spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %this, ptr byval(%"range") %Index) { +entry: + %retval = alloca ptr addrspace(4) + %this.addr = alloca ptr addrspace(4) + %LinearIndex = alloca i64 + %agg.tmp = alloca %"range" + %cleanup.dest.slot = alloca i32, align 4 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %call = call spir_func i64 @Foo93(ptr addrspace(4) %this1, ptr byval(%"range") %agg.tmp) + store i64 %call, ptr %LinearIndex + %call2 = call spir_func ptr addrspace(1) @Foo94(ptr addrspace(4) %this1) + %0 = load i64, ptr %LinearIndex + %arrayidx = getelementptr inbounds nuw i64, ptr addrspace(1) %call2, i64 %0 + %arrayidx.ascast = addrspacecast ptr addrspace(1) %arrayidx to ptr addrspace(4) + ret ptr addrspace(4) %arrayidx.ascast +} + + +define internal spir_func void @Foo75(ptr byval(%"tangle_group") %G, i32 %FenceScope) { +entry: + %FenceScope.addr = alloca i32, align 4 + %agg.tmp = alloca %"tangle_group" + store i32 %FenceScope, ptr %FenceScope.addr, align 4 + %0 = load i32, ptr %FenceScope.addr, align 4 + call spir_func void @Foo95(ptr byval(%"tangle_group") %agg.tmp, i32 %0, i32 5) + ret void +} + + +define internal spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %this, ptr byval(%"range") %Index) { +entry: + %retval = alloca ptr addrspace(4) + %this.addr = alloca ptr addrspace(4) + %LinearIndex = alloca i64 + %agg.tmp = alloca %"range" + %cleanup.dest.slot = alloca i32, align 4 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %call = call spir_func i64 @Foo77(ptr addrspace(4) %this1, ptr byval(%"range") %agg.tmp) + store i64 %call, ptr %LinearIndex + %call2 = call spir_func ptr addrspace(1) @Foo78(ptr addrspace(4) %this1) + %0 = load i64, ptr %LinearIndex + %arrayidx = getelementptr inbounds nuw i8, ptr addrspace(1) %call2, i64 %0 + %arrayidx.ascast = addrspacecast ptr addrspace(1) %arrayidx to ptr addrspace(4) + ret ptr addrspace(4) %arrayidx.ascast +} + + +define internal spir_func i32 @Foo76(ptr addrspace(4) align 1 %this) { +entry: + %retval = alloca i32, align 4 + %this.addr = alloca ptr addrspace(4) 
+ %ref.tmp = alloca %"range" + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + call spir_func void @Foo96(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) align 1 %this1) + %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %ref.tmp.ascast, i32 0) + %0 = load i64, ptr addrspace(4) %call + %conv = trunc i64 %0 to i32 + ret i32 %conv +} + + +define internal spir_func i32 @Foo90(ptr addrspace(4) %this) { +entry: + %retval = alloca i32, align 4 + %this.addr = alloca ptr addrspace(4) + %ref.tmp = alloca %"range" + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %this1) + %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %ref.tmp.ascast, i32 0) + %0 = load i64, ptr addrspace(4) %call + %conv = trunc i64 %0 to i32 + ret i32 %conv +} + + +define internal spir_func i32 @Foo91(ptr byval(%"tangle_group") %g, i32 %x, i32 %linear_local_id) { +entry: + %retval = alloca i32, align 4 + %x.addr = alloca i32, align 4 + %linear_local_id.addr = alloca i32, align 4 + %agg.tmp = alloca %"tangle_group" + %agg.tmp1 = alloca %"range" + %agg.tmp2 = alloca %"range" + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + %agg.tmp1.ascast = addrspacecast ptr %agg.tmp1 to ptr addrspace(4) + %agg.tmp2.ascast = addrspacecast ptr %agg.tmp2 to ptr addrspace(4) + %g.ascast = addrspacecast ptr %g to ptr addrspace(4) + store i32 %x, ptr %x.addr, align 4 + store i32 %linear_local_id, ptr %linear_local_id.addr, align 4 + %0 = load i32, ptr %x.addr, align 4 + call spir_func void @Foo97(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp2.ascast, ptr addrspace(4) %g.ascast) + %1 = load i32, ptr %linear_local_id.addr, align 4 + %conv = zext i32 %1 to i64 + call spir_func void @Foo98(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp1.ascast, ptr byval(%"range") %agg.tmp2, i64 %conv) + %call = call spir_func i32 @Bar69(ptr byval(%"tangle_group") %agg.tmp, i32 %0, ptr byval(%"range") %agg.tmp1) + ret i32 %call +} + + +define internal spir_func zeroext i1 @Foo92(ptr byval(%"tangle_group") %g, i1 zeroext %pred) { +entry: + %retval = alloca i1, align 1 + %pred.addr = alloca i8, align 1 + %agg.tmp = alloca %"tangle_group" + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + %storedv = zext i1 %pred to i8 + store i8 %storedv, ptr %pred.addr, align 1 + %0 = load i8, ptr %pred.addr, align 1 + %loadedv = trunc i8 %0 to i1 + %call = call spir_func zeroext i1 @Bar10(ptr byval(%"tangle_group") %agg.tmp, i1 zeroext %loadedv) + ret i1 %call +} + + +define internal spir_func zeroext i1 @Foo67(ptr byval(%"tangle_group") %g, i1 zeroext %pred) { +entry: + %retval = alloca i1, align 1 + %pred.addr = alloca i8, align 1 + %agg.tmp = alloca %"tangle_group" + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + %storedv = zext i1 %pred to i8 + store i8 %storedv, ptr %pred.addr, align 1 + %0 = load i8, ptr %pred.addr, align 1 + %loadedv = trunc i8 %0 to i1 + %call = call spir_func zeroext i1 @Foo66(ptr byval(%"tangle_group") %agg.tmp, i1 zeroext %loadedv) + 
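; @Foo67 forwards the predicate to @Foo66 unchanged; @Foo65 below negates it first. +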
ret i1 %call +} + + +define internal spir_func zeroext i1 @Foo65(ptr byval(%"tangle_group") %g, i1 zeroext %pred) { +entry: + %retval = alloca i1, align 1 + %pred.addr = alloca i8, align 1 + %agg.tmp = alloca %"tangle_group" + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + %storedv = zext i1 %pred to i8 + store i8 %storedv, ptr %pred.addr, align 1 + %0 = load i8, ptr %pred.addr, align 1 + %loadedv = trunc i8 %0 to i1 + %lnot = xor i1 %loadedv, true + %call = call spir_func zeroext i1 @Foo66(ptr byval(%"tangle_group") %agg.tmp, i1 zeroext %lnot) + ret i1 %call +} + + +define internal spir_func i32 @Foo64(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"nd_item") align 1 %binary_op) { +entry: + %retval = alloca i32, align 4 + %x.addr = alloca i32, align 4 + %agg.tmp = alloca %"tangle_group" + %agg.tmp1 = alloca %"nd_item", align 1 + %agg.tmp2 = alloca %"nd_item", align 1 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store i32 %x, ptr %x.addr, align 4 + %binary_op.ascast = addrspacecast ptr %binary_op to ptr addrspace(4) + %0 = load i32, ptr %x.addr, align 4 + %call = call spir_func i32 @Bar11(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %0, ptr byval(%"nd_item") align 1 %agg.tmp2) + ret i32 %call +} + + +define internal spir_func i32 @Foo63(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"nd_item") align 1 %binary_op) { +entry: + %retval = alloca i32, align 4 + %x.addr = alloca i32, align 4 + %res = alloca i32, align 4 + %agg.tmp = alloca %"tangle_group" + %agg.tmp1 = alloca %"nd_item", align 1 + %agg.tmp2 = alloca %"nd_item", align 1 + %cleanup.dest.slot = alloca i32, align 4 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store i32 %x, ptr %x.addr, align 4 + %binary_op.ascast = addrspacecast ptr %binary_op to ptr addrspace(4) + %0 = load i32, ptr %x.addr, align 4 + %call = call spir_func i32 @Bar12(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %0, ptr byval(%"nd_item") align 1 %agg.tmp2) + store i32 %call, ptr %res, align 4 + %1 = load i32, ptr %res, align 4 + ret i32 %1 +} + + +define internal spir_func i32 @Foo62(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"nd_item") align 1 %binary_op) { +entry: + %retval = alloca i32, align 4 + %x.addr = alloca i32, align 4 + %agg.tmp = alloca %"tangle_group" + %agg.tmp1 = alloca %"nd_item", align 1 + %agg.tmp2 = alloca %"nd_item", align 1 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store i32 %x, ptr %x.addr, align 4 + %binary_op.ascast = addrspacecast ptr %binary_op to ptr addrspace(4) + %0 = load i32, ptr %x.addr, align 4 + %call = call spir_func i32 @Foo61(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %0, ptr byval(%"nd_item") align 1 %agg.tmp2) + ret i32 %call +} + + +define internal spir_func i32 @Foo73(ptr byval(%"tangle_group") %g, i32 %x, i32 %delta) { +entry: + %retval = alloca i32, align 4 + %x.addr = alloca i32, align 4 + %delta.addr = alloca i32, align 4 + %agg.tmp = alloca %"tangle_group" + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store i32 %x, ptr %x.addr, align 4 + store i32 %delta, ptr %delta.addr, align 4 + %0 = load i32, ptr %x.addr, align 4 + %1 = load i32, ptr %delta.addr, align 4 + %call = call spir_func i32 @Foo72(ptr byval(%"tangle_group") %agg.tmp, i32 %0, i32 %1) + ret i32 %call +} + + +define internal spir_func zeroext i1 @Foo71(ptr addrspace(4) align 1 %this, i64 %Other) { +entry: + %retval = alloca i1, 
align 1 + %this.addr = alloca ptr addrspace(4) + %Other.addr = alloca i64 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + store i64 %Other, ptr %Other.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %0 = load i64, ptr %Other.addr + %cmp = icmp ult i64 %0, 4 + ret i1 %cmp +} + + +define internal spir_func i32 @Foo53(ptr byval(%"tangle_group") %g, i32 %x, i32 %delta) { +entry: + %retval = alloca i32, align 4 + %x.addr = alloca i32, align 4 + %delta.addr = alloca i32, align 4 + %agg.tmp = alloca %"tangle_group" + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store i32 %x, ptr %x.addr, align 4 + store i32 %delta, ptr %delta.addr, align 4 + %0 = load i32, ptr %x.addr, align 4 + %1 = load i32, ptr %delta.addr, align 4 + %call = call spir_func i32 @Foo52(ptr byval(%"tangle_group") %agg.tmp, i32 %0, i32 %1) + ret i32 %call +} + + +define internal spir_func void @Foo51(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %this) { +entry: + %this.addr = alloca ptr addrspace(4) + %agg.tmp = alloca %"ss_sub_group_mask" + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %Mask1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %call = call spir_func i32 @Foo47(ptr byval(%"ss_sub_group_mask") %agg.tmp) + %conv = zext i32 %call to i64 + call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %conv) + ret void +} + + +define internal spir_func void @Foo55(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %lhs, ptr addrspace(4) align 4 %rhs) { +entry: + %lhs.addr = alloca ptr addrspace(4) + %rhs.addr = alloca ptr addrspace(4) + %i = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + store ptr addrspace(4) %lhs, ptr %lhs.addr + store ptr addrspace(4) %rhs, ptr %rhs.addr + call spir_func void @Foo11(ptr addrspace(4) %agg.result) + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 1 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + ret void + +for.body: ; preds = %for.cond + %1 = load ptr addrspace(4), ptr %lhs.addr + %common_array2 = bitcast ptr addrspace(4) %1 to ptr addrspace(4) + %2 = load i32, ptr %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array2, i64 0, i64 %idxprom + %3 = load i64, ptr addrspace(4) %arrayidx + %4 = load ptr addrspace(4), ptr %rhs.addr + %5 = load i32, ptr addrspace(4) %4, align 4 + %conv = sext i32 %5 to i64 + %add = add i64 %3, %conv + %common_array13 = bitcast ptr addrspace(4) %agg.result to ptr addrspace(4) + %6 = load i32, ptr %i, align 4 + %idxprom2 = sext i32 %6 to i64 + %arrayidx3 = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array13, i64 0, i64 %idxprom2 + store i64 %add, ptr addrspace(4) %arrayidx3 + %7 = load i32, ptr %i, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, ptr %i, align 4 + br label %for.cond +} + + +define internal spir_func void @Foo56(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %lhs, ptr addrspace(4) %rhs) { +entry: + %lhs.addr = alloca ptr addrspace(4) + %rhs.addr = alloca ptr addrspace(4) + %i = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + store ptr addrspace(4) %lhs, ptr %lhs.addr + store ptr addrspace(4) 
%rhs, ptr %rhs.addr + call spir_func void @Foo11(ptr addrspace(4) %agg.result) + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 1 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + ret void + +for.body: ; preds = %for.cond + %1 = load ptr addrspace(4), ptr %lhs.addr + %common_array2 = bitcast ptr addrspace(4) %1 to ptr addrspace(4) + %2 = load i32, ptr %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array2, i64 0, i64 %idxprom + %3 = load i64, ptr addrspace(4) %arrayidx + %4 = load ptr addrspace(4), ptr %rhs.addr + %5 = load i64, ptr addrspace(4) %4 + %rem = urem i64 %3, %5 + %common_array13 = bitcast ptr addrspace(4) %agg.result to ptr addrspace(4) + %6 = load i32, ptr %i, align 4 + %idxprom2 = sext i32 %6 to i64 + %arrayidx3 = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array13, i64 0, i64 %idxprom2 + store i64 %rem, ptr addrspace(4) %arrayidx3 + %7 = load i32, ptr %i, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, ptr %i, align 4 + br label %for.cond +} + + +define internal spir_func i32 @Foo57(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"range") %local_id) { +entry: + %retval = alloca i32, align 4 + %x.addr = alloca i32, align 4 + %agg.tmp = alloca %"tangle_group" + %agg.tmp1 = alloca %"range" + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store i32 %x, ptr %x.addr, align 4 + %0 = load i32, ptr %x.addr, align 4 + %call = call spir_func i32 @Foo59(ptr byval(%"tangle_group") %agg.tmp, i32 %0, ptr byval(%"range") %agg.tmp1) + ret i32 %call +} + + +define internal spir_func i32 @Foo58(ptr byval(%"tangle_group") %g, i32 %x, i32 %mask) { +entry: + %retval = alloca i32, align 4 + %x.addr = alloca i32, align 4 + %mask.addr = alloca i32, align 4 + %agg.tmp = alloca %"tangle_group" + %agg.tmp1 = alloca %"range" + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + %agg.tmp1.ascast = addrspacecast ptr %agg.tmp1 to ptr addrspace(4) + store i32 %x, ptr %x.addr, align 4 + store i32 %mask, ptr %mask.addr, align 4 + %0 = load i32, ptr %x.addr, align 4 + %1 = load i32, ptr %mask.addr, align 4 + %conv = zext i32 %1 to i64 + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp1.ascast, i64 %conv) + %call = call spir_func i32 @Bar13(ptr byval(%"tangle_group") %agg.tmp, i32 %0, ptr byval(%"range") %agg.tmp1) + ret i32 %call +} + + +define internal spir_func void @Foo68(ptr addrspace(4) %this, i64 %WI, ptr byval(%"tangle_group") %Tangle, i64 %TangleLeader, i64 %TangleSize, ptr byval(%"nd_item") align 1 %IsMember) { +entry: + %this.addr = alloca ptr addrspace(4) + %WI.addr = alloca i64 + %TangleLeader.addr = alloca i64 + %TangleSize.addr = alloca i64 + %agg.tmp = alloca %"range" + %agg.tmp2 = alloca %"tangle_group" + %Visible = alloca i64 + %Other = alloca i64 + %cleanup.dest.slot = alloca i32, align 4 + %agg.tmp5 = alloca %"range" + %agg.tmp8 = alloca %"range" + %OriginalLID = alloca i32, align 4 + %LID = alloca i32, align 4 + %BroadcastResult = alloca i32, align 4 + %agg.tmp12 = alloca %"tangle_group" + %agg.tmp15 = alloca %"range" + %AnyResult = alloca i8, align 1 + %agg.tmp18 = alloca %"tangle_group" + %agg.tmp24 = alloca %"range" + %AllResult = alloca i8, align 1 + %agg.tmp27 = alloca %"tangle_group" + %agg.tmp35 = alloca %"range" + %NoneResult = alloca i8, align 1 + %agg.tmp38 = alloca %"tangle_group" + %agg.tmp46 = alloca 
%"range" + %ReduceResult = alloca i32, align 4 + %agg.tmp49 = alloca %"tangle_group" + %agg.tmp50 = alloca %"nd_item", align 1 + %agg.tmp54 = alloca %"range" + %ExScanResult = alloca i32, align 4 + %agg.tmp57 = alloca %"tangle_group" + %agg.tmp58 = alloca %"nd_item", align 1 + %agg.tmp61 = alloca %"range" + %IncScanResult = alloca i32, align 4 + %agg.tmp64 = alloca %"tangle_group" + %agg.tmp65 = alloca %"nd_item", align 1 + %agg.tmp69 = alloca %"range" + %ShiftLeftResult = alloca i32, align 4 + %agg.tmp72 = alloca %"tangle_group" + %agg.tmp79 = alloca %"range" + %ShiftRightResult = alloca i32, align 4 + %agg.tmp82 = alloca %"tangle_group" + %agg.tmp88 = alloca %"range" + %SelectResult = alloca i32, align 4 + %agg.tmp91 = alloca %"tangle_group" + %agg.tmp92 = alloca %"range" + %ref.tmp = alloca %"range" + %ref.tmp93 = alloca %"range" + %ref.tmp94 = alloca i32, align 4 + %agg.tmp100 = alloca %"range" + %PermuteXorResult = alloca i32, align 4 + %agg.tmp103 = alloca %"tangle_group" + %agg.tmp106 = alloca %"range" + %TangleSize.addr.ascast = addrspacecast ptr %TangleSize.addr to ptr addrspace(4) + %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4) + %agg.tmp5.ascast = addrspacecast ptr %agg.tmp5 to ptr addrspace(4) + %agg.tmp8.ascast = addrspacecast ptr %agg.tmp8 to ptr addrspace(4) + %agg.tmp15.ascast = addrspacecast ptr %agg.tmp15 to ptr addrspace(4) + %agg.tmp24.ascast = addrspacecast ptr %agg.tmp24 to ptr addrspace(4) + %agg.tmp35.ascast = addrspacecast ptr %agg.tmp35 to ptr addrspace(4) + %agg.tmp46.ascast = addrspacecast ptr %agg.tmp46 to ptr addrspace(4) + %agg.tmp54.ascast = addrspacecast ptr %agg.tmp54 to ptr addrspace(4) + %agg.tmp61.ascast = addrspacecast ptr %agg.tmp61 to ptr addrspace(4) + %agg.tmp69.ascast = addrspacecast ptr %agg.tmp69 to ptr addrspace(4) + %agg.tmp79.ascast = addrspacecast ptr %agg.tmp79 to ptr addrspace(4) + %agg.tmp88.ascast = addrspacecast ptr %agg.tmp88 to ptr addrspace(4) + %agg.tmp92.ascast = addrspacecast ptr %agg.tmp92 to ptr addrspace(4) + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + %ref.tmp93.ascast = addrspacecast ptr %ref.tmp93 to ptr addrspace(4) + %ref.tmp94.ascast = addrspacecast ptr %ref.tmp94 to ptr addrspace(4) + %agg.tmp100.ascast = addrspacecast ptr %agg.tmp100 to ptr addrspace(4) + %agg.tmp106.ascast = addrspacecast ptr %agg.tmp106 to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + store i64 %WI, ptr %WI.addr + %Tangle.ascast = addrspacecast ptr %Tangle to ptr addrspace(4) + store i64 %TangleLeader, ptr %TangleLeader.addr + store i64 %TangleSize, ptr %TangleSize.addr + %IsMember.ascast = addrspacecast ptr %IsMember to ptr addrspace(4) + %this1 = load ptr addrspace(4), ptr %this.addr + %TmpAcc1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %0 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp.ascast, i64 %0) + %call = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %TmpAcc1, ptr byval(%"range") %agg.tmp) + store i64 1, ptr addrspace(4) %call + call spir_func void @Foo75(ptr byval(%"tangle_group") %agg.tmp2, i32 1) + store i64 0, ptr %Visible + store i64 0, ptr %Other + br label %for.cond + +for.cond: ; preds = %if.end, %entry + %1 = load i64, ptr %Other + %cmp = icmp ult i64 %1, 32 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %2 = load i64, ptr %Visible + %3 = load i64, ptr %TangleSize.addr + %cmp7 = icmp eq i64 %2, %3 + %BarrierAcc = getelementptr inbounds nuw %class.anon.8, ptr 
addrspace(4) %this1, i32 0, i32 1 + %4 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp8.ascast, i64 %4) + %call9 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BarrierAcc, ptr byval(%"range") %agg.tmp8) + %storedv = zext i1 %cmp7 to i8 + store i8 %storedv, ptr addrspace(4) %call9, align 1 + %5 = getelementptr inbounds i8, ptr addrspace(4) %this1, i64 64 + %call10 = call spir_func i32 @Foo76(ptr addrspace(4) align 1 %5) + store i32 %call10, ptr %OriginalLID, align 4 + %call11 = call spir_func i32 @Foo90(ptr addrspace(4) %Tangle.ascast) + store i32 %call11, ptr %LID, align 4 + %6 = load i32, ptr %OriginalLID, align 4 + %call13 = call spir_func i32 @Foo91(ptr byval(%"tangle_group") %agg.tmp12, i32 %6, i32 0) + store i32 %call13, ptr %BroadcastResult, align 4 + %7 = load i32, ptr %BroadcastResult, align 4 + %conv = zext i32 %7 to i64 + %8 = load i64, ptr %TangleLeader.addr + %cmp14 = icmp eq i64 %conv, %8 + %BroadcastAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 3 + %9 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp15.ascast, i64 %9) + %call16 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BroadcastAcc, ptr byval(%"range") %agg.tmp15) + %storedv17 = zext i1 %cmp14 to i8 + store i8 %storedv17, ptr addrspace(4) %call16, align 1 + %10 = load i32, ptr %LID, align 4 + %cmp19 = icmp eq i32 %10, 0 + %call20 = call spir_func zeroext i1 @Foo92(ptr byval(%"tangle_group") %agg.tmp18, i1 zeroext %cmp19) + %storedv21 = zext i1 %call20 to i8 + store i8 %storedv21, ptr %AnyResult, align 1 + %11 = load i8, ptr %AnyResult, align 1 + %loadedv = trunc i8 %11 to i1 + %conv22 = zext i1 %loadedv to i32 + %cmp23 = icmp eq i32 %conv22, 1 + %AnyAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 4 + %12 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp24.ascast, i64 %12) + %call25 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AnyAcc, ptr byval(%"range") %agg.tmp24) + %storedv26 = zext i1 %cmp23 to i8 + store i8 %storedv26, ptr addrspace(4) %call25, align 1 + %13 = load i32, ptr %LID, align 4 + %conv28 = zext i32 %13 to i64 + %14 = load i64, ptr %TangleSize.addr + %cmp29 = icmp ult i64 %conv28, %14 + %call30 = call spir_func zeroext i1 @Foo67(ptr byval(%"tangle_group") %agg.tmp27, i1 zeroext %cmp29) + %storedv31 = zext i1 %call30 to i8 + store i8 %storedv31, ptr %AllResult, align 1 + %15 = load i8, ptr %AllResult, align 1 + %loadedv32 = trunc i8 %15 to i1 + %conv33 = zext i1 %loadedv32 to i32 + %cmp34 = icmp eq i32 %conv33, 1 + %AllAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 5 + %16 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp35.ascast, i64 %16) + %call36 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AllAcc, ptr byval(%"range") %agg.tmp35) + %storedv37 = zext i1 %cmp34 to i8 + store i8 %storedv37, ptr addrspace(4) %call36, align 1 + %17 = load i32, ptr %LID, align 4 + %conv39 = zext i32 %17 to i64 + %18 = load i64, ptr %TangleSize.addr + %cmp40 = icmp uge i64 %conv39, %18 + %call41 = call spir_func zeroext i1 @Foo65(ptr byval(%"tangle_group") %agg.tmp38, i1 zeroext %cmp40) + %storedv42 = zext i1 %call41 to i8 + store i8 %storedv42, ptr %NoneResult, align 1 + %19 = load i8, ptr %NoneResult, align 1 + %loadedv43 = trunc i8 %19 to i1 + %conv44 = zext i1 %loadedv43 to i32 + %cmp45 = icmp eq i32 %conv44, 1 + 
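; The none-of result (%NoneResult == 1) is stored to this work-item's NoneAcc slot below. +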
%NoneAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 6 + %20 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp46.ascast, i64 %20) + %call47 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %NoneAcc, ptr byval(%"range") %agg.tmp46) + %storedv48 = zext i1 %cmp45 to i8 + store i8 %storedv48, ptr addrspace(4) %call47, align 1 + %call51 = call spir_func i32 @Foo64(ptr byval(%"tangle_group") %agg.tmp49, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp50) + store i32 %call51, ptr %ReduceResult, align 4 + %21 = load i32, ptr %ReduceResult, align 4 + %conv52 = zext i32 %21 to i64 + %22 = load i64, ptr %TangleSize.addr + %cmp53 = icmp eq i64 %conv52, %22 + %ReduceAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 7 + %23 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp54.ascast, i64 %23) + %call55 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ReduceAcc, ptr byval(%"range") %agg.tmp54) + %storedv56 = zext i1 %cmp53 to i8 + store i8 %storedv56, ptr addrspace(4) %call55, align 1 + %call59 = call spir_func i32 @Foo63(ptr byval(%"tangle_group") %agg.tmp57, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp58) + store i32 %call59, ptr %ExScanResult, align 4 + %24 = load i32, ptr %ExScanResult, align 4 + %25 = load i32, ptr %LID, align 4 + %cmp60 = icmp eq i32 %24, %25 + %ExScanAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 8 + %26 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp61.ascast, i64 %26) + %call62 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ExScanAcc, ptr byval(%"range") %agg.tmp61) + %storedv63 = zext i1 %cmp60 to i8 + store i8 %storedv63, ptr addrspace(4) %call62, align 1 + %call66 = call spir_func i32 @Foo62(ptr byval(%"tangle_group") %agg.tmp64, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp65) + store i32 %call66, ptr %IncScanResult, align 4 + %27 = load i32, ptr %IncScanResult, align 4 + %28 = load i32, ptr %LID, align 4 + %add67 = add i32 %28, 1 + %cmp68 = icmp eq i32 %27, %add67 + %IncScanAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 9 + %29 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp69.ascast, i64 %29) + %call70 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %IncScanAcc, ptr byval(%"range") %agg.tmp69) + %storedv71 = zext i1 %cmp68 to i8 + store i8 %storedv71, ptr addrspace(4) %call70, align 1 + %30 = load i32, ptr %LID, align 4 + %call73 = call spir_func i32 @Foo73(ptr byval(%"tangle_group") %agg.tmp72, i32 %30, i32 2) + store i32 %call73, ptr %ShiftLeftResult, align 4 + %31 = load i32, ptr %LID, align 4 + %add74 = add i32 %31, 2 + %conv75 = zext i32 %add74 to i64 + %32 = load i64, ptr %TangleSize.addr + %cmp76 = icmp uge i64 %conv75, %32 + br i1 %cmp76, label %lor.end, label %lor.rhs + +lor.rhs: ; preds = %for.cond.cleanup + %33 = load i32, ptr %ShiftLeftResult, align 4 + %34 = load i32, ptr %LID, align 4 + %add77 = add i32 %34, 2 + %cmp78 = icmp eq i32 %33, %add77 + br label %lor.end + +lor.end: ; preds = %lor.rhs, %for.cond.cleanup + %35 = phi i1 [ true, %for.cond.cleanup ], [ %cmp78, %lor.rhs ] + %ShiftLeftAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 10 + %36 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp79.ascast, i64 %36) + %call80 = call spir_func align 1 ptr addrspace(4) 
@Foo54(ptr addrspace(4) %ShiftLeftAcc, ptr byval(%"range") %agg.tmp79) + %storedv81 = zext i1 %35 to i8 + store i8 %storedv81, ptr addrspace(4) %call80, align 1 + %37 = load i32, ptr %LID, align 4 + %call83 = call spir_func i32 @Foo53(ptr byval(%"tangle_group") %agg.tmp82, i32 %37, i32 2) + store i32 %call83, ptr %ShiftRightResult, align 4 + %38 = load i32, ptr %LID, align 4 + %cmp84 = icmp ult i32 %38, 2 + br i1 %cmp84, label %lor.end87, label %lor.rhs85 + +lor.rhs85: ; preds = %lor.end + %39 = load i32, ptr %ShiftRightResult, align 4 + %40 = load i32, ptr %LID, align 4 + %sub = sub i32 %40, 2 + %cmp86 = icmp eq i32 %39, %sub + br label %lor.end87 + +lor.end87: ; preds = %lor.rhs85, %lor.end + %41 = phi i1 [ true, %lor.end ], [ %cmp86, %lor.rhs85 ] + %ShiftRightAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 11 + %42 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp88.ascast, i64 %42) + %call89 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ShiftRightAcc, ptr byval(%"range") %agg.tmp88) + %storedv90 = zext i1 %41 to i8 + store i8 %storedv90, ptr addrspace(4) %call89, align 1 + %43 = load i32, ptr %LID, align 4 + call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp93.ascast, ptr addrspace(4) %Tangle.ascast) + store i32 2, ptr %ref.tmp94, align 4 + call spir_func void @Foo55(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %ref.tmp93.ascast, ptr addrspace(4) align 4 %ref.tmp94.ascast) + call spir_func void @Foo56(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp92.ascast, ptr addrspace(4) %ref.tmp.ascast, ptr addrspace(4) %TangleSize.addr.ascast) + %call95 = call spir_func i32 @Foo57(ptr byval(%"tangle_group") %agg.tmp91, i32 %43, ptr byval(%"range") %agg.tmp92) + store i32 %call95, ptr %SelectResult, align 4 + %44 = load i32, ptr %SelectResult, align 4 + %conv96 = zext i32 %44 to i64 + %45 = load i32, ptr %LID, align 4 + %add97 = add i32 %45, 2 + %conv98 = zext i32 %add97 to i64 + %46 = load i64, ptr %TangleSize.addr + %rem = urem i64 %conv98, %46 + %cmp99 = icmp eq i64 %conv96, %rem + %SelectAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 12 + %47 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp100.ascast, i64 %47) + %call101 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %SelectAcc, ptr byval(%"range") %agg.tmp100) + %storedv102 = zext i1 %cmp99 to i8 + store i8 %storedv102, ptr addrspace(4) %call101, align 1 + %48 = load i32, ptr %LID, align 4 + %call104 = call spir_func i32 @Foo58(ptr byval(%"tangle_group") %agg.tmp103, i32 %48, i32 2) + store i32 %call104, ptr %PermuteXorResult, align 4 + %49 = load i32, ptr %PermuteXorResult, align 4 + %50 = load i32, ptr %LID, align 4 + %xor = xor i32 %50, 2 + %cmp105 = icmp eq i32 %49, %xor + %PermuteXorAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 13 + %51 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp106.ascast, i64 %51) + %call107 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %PermuteXorAcc, ptr byval(%"range") %agg.tmp106) + %storedv108 = zext i1 %cmp105 to i8 + store i8 %storedv108, ptr addrspace(4) %call107, align 1 + ret void + +for.body: ; preds = %for.cond + %52 = load i64, ptr %Other + %call3 = call spir_func zeroext i1 @Foo74(ptr addrspace(4) align 1 %IsMember.ascast, i64 %52) + br i1 
%call3, label %if.then, label %if.end + +if.then: ; preds = %for.body + %TmpAcc42 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %53 = load i64, ptr %Other + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp5.ascast, i64 %53) + %call6 = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %TmpAcc42, ptr byval(%"range") %agg.tmp5) + %54 = load i64, ptr addrspace(4) %call6 + %55 = load i64, ptr %Visible + %add = add i64 %55, %54 + store i64 %add, ptr %Visible + br label %if.end + +if.end: ; preds = %if.then, %for.body + %56 = load i64, ptr %Other + %inc = add i64 %56, 1 + store i64 %inc, ptr %Other + br label %for.cond +} + + +define internal spir_func void @Foo69(ptr addrspace(4) %this, i64 %WI, ptr byval(%"tangle_group") %Tangle, i64 %TangleLeader, i64 %TangleSize, ptr byval(%"nd_item") align 1 %IsMember) { +entry: + %this.addr = alloca ptr addrspace(4) + %WI.addr = alloca i64 + %TangleLeader.addr = alloca i64 + %TangleSize.addr = alloca i64 + %agg.tmp = alloca %"range" + %agg.tmp2 = alloca %"tangle_group" + %Visible = alloca i64 + %Other = alloca i64 + %cleanup.dest.slot = alloca i32, align 4 + %agg.tmp5 = alloca %"range" + %agg.tmp8 = alloca %"range" + %OriginalLID = alloca i32, align 4 + %LID = alloca i32, align 4 + %BroadcastResult = alloca i32, align 4 + %agg.tmp12 = alloca %"tangle_group" + %agg.tmp15 = alloca %"range" + %AnyResult = alloca i8, align 1 + %agg.tmp18 = alloca %"tangle_group" + %agg.tmp24 = alloca %"range" + %AllResult = alloca i8, align 1 + %agg.tmp27 = alloca %"tangle_group" + %agg.tmp35 = alloca %"range" + %NoneResult = alloca i8, align 1 + %agg.tmp38 = alloca %"tangle_group" + %agg.tmp46 = alloca %"range" + %ReduceResult = alloca i32, align 4 + %agg.tmp49 = alloca %"tangle_group" + %agg.tmp50 = alloca %"nd_item", align 1 + %agg.tmp54 = alloca %"range" + %ExScanResult = alloca i32, align 4 + %agg.tmp57 = alloca %"tangle_group" + %agg.tmp58 = alloca %"nd_item", align 1 + %agg.tmp61 = alloca %"range" + %IncScanResult = alloca i32, align 4 + %agg.tmp64 = alloca %"tangle_group" + %agg.tmp65 = alloca %"nd_item", align 1 + %agg.tmp69 = alloca %"range" + %ShiftLeftResult = alloca i32, align 4 + %agg.tmp72 = alloca %"tangle_group" + %agg.tmp79 = alloca %"range" + %ShiftRightResult = alloca i32, align 4 + %agg.tmp82 = alloca %"tangle_group" + %agg.tmp88 = alloca %"range" + %SelectResult = alloca i32, align 4 + %agg.tmp91 = alloca %"tangle_group" + %agg.tmp92 = alloca %"range" + %ref.tmp = alloca %"range" + %ref.tmp93 = alloca %"range" + %ref.tmp94 = alloca i32, align 4 + %agg.tmp100 = alloca %"range" + %PermuteXorResult = alloca i32, align 4 + %agg.tmp103 = alloca %"tangle_group" + %agg.tmp106 = alloca %"range" + %TangleSize.addr.ascast = addrspacecast ptr %TangleSize.addr to ptr addrspace(4) + %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4) + %agg.tmp5.ascast = addrspacecast ptr %agg.tmp5 to ptr addrspace(4) + %agg.tmp8.ascast = addrspacecast ptr %agg.tmp8 to ptr addrspace(4) + %agg.tmp15.ascast = addrspacecast ptr %agg.tmp15 to ptr addrspace(4) + %agg.tmp24.ascast = addrspacecast ptr %agg.tmp24 to ptr addrspace(4) + %agg.tmp35.ascast = addrspacecast ptr %agg.tmp35 to ptr addrspace(4) + %agg.tmp46.ascast = addrspacecast ptr %agg.tmp46 to ptr addrspace(4) + %agg.tmp54.ascast = addrspacecast ptr %agg.tmp54 to ptr addrspace(4) + %agg.tmp61.ascast = addrspacecast ptr %agg.tmp61 to ptr addrspace(4) + %agg.tmp69.ascast = addrspacecast ptr %agg.tmp69 to ptr addrspace(4) + %agg.tmp79.ascast = addrspacecast ptr %agg.tmp79 to ptr addrspace(4) + 
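; @Foo69 mirrors the @Foo68 check sequence, differing only in the membership predicate (@Bar14, called in for.body below). +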
%agg.tmp88.ascast = addrspacecast ptr %agg.tmp88 to ptr addrspace(4) + %agg.tmp92.ascast = addrspacecast ptr %agg.tmp92 to ptr addrspace(4) + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + %ref.tmp93.ascast = addrspacecast ptr %ref.tmp93 to ptr addrspace(4) + %ref.tmp94.ascast = addrspacecast ptr %ref.tmp94 to ptr addrspace(4) + %agg.tmp100.ascast = addrspacecast ptr %agg.tmp100 to ptr addrspace(4) + %agg.tmp106.ascast = addrspacecast ptr %agg.tmp106 to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + store i64 %WI, ptr %WI.addr + %Tangle.ascast = addrspacecast ptr %Tangle to ptr addrspace(4) + store i64 %TangleLeader, ptr %TangleLeader.addr + store i64 %TangleSize, ptr %TangleSize.addr + %IsMember.ascast = addrspacecast ptr %IsMember to ptr addrspace(4) + %this1 = load ptr addrspace(4), ptr %this.addr + %TmpAcc1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %0 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp.ascast, i64 %0) + %call = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %TmpAcc1, ptr byval(%"range") %agg.tmp) + store i64 1, ptr addrspace(4) %call + call spir_func void @Foo75(ptr byval(%"tangle_group") %agg.tmp2, i32 1) + store i64 0, ptr %Visible + store i64 0, ptr %Other + br label %for.cond + +for.cond: ; preds = %if.end, %entry + %1 = load i64, ptr %Other + %cmp = icmp ult i64 %1, 32 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %2 = load i64, ptr %Visible + %3 = load i64, ptr %TangleSize.addr + %cmp7 = icmp eq i64 %2, %3 + %BarrierAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 1 + %4 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp8.ascast, i64 %4) + %call9 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BarrierAcc, ptr byval(%"range") %agg.tmp8) + %storedv = zext i1 %cmp7 to i8 + store i8 %storedv, ptr addrspace(4) %call9, align 1 + %5 = getelementptr inbounds i8, ptr addrspace(4) %this1, i64 64 + %call10 = call spir_func i32 @Foo76(ptr addrspace(4) align 1 %5) + store i32 %call10, ptr %OriginalLID, align 4 + %call11 = call spir_func i32 @Foo90(ptr addrspace(4) %Tangle.ascast) + store i32 %call11, ptr %LID, align 4 + %6 = load i32, ptr %OriginalLID, align 4 + %call13 = call spir_func i32 @Foo91(ptr byval(%"tangle_group") %agg.tmp12, i32 %6, i32 0) + store i32 %call13, ptr %BroadcastResult, align 4 + %7 = load i32, ptr %BroadcastResult, align 4 + %conv = zext i32 %7 to i64 + %8 = load i64, ptr %TangleLeader.addr + %cmp14 = icmp eq i64 %conv, %8 + %BroadcastAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 3 + %9 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp15.ascast, i64 %9) + %call16 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %BroadcastAcc, ptr byval(%"range") %agg.tmp15) + %storedv17 = zext i1 %cmp14 to i8 + store i8 %storedv17, ptr addrspace(4) %call16, align 1 + %10 = load i32, ptr %LID, align 4 + %cmp19 = icmp eq i32 %10, 0 + %call20 = call spir_func zeroext i1 @Foo92(ptr byval(%"tangle_group") %agg.tmp18, i1 zeroext %cmp19) + %storedv21 = zext i1 %call20 to i8 + store i8 %storedv21, ptr %AnyResult, align 1 + %11 = load i8, ptr %AnyResult, align 1 + %loadedv = trunc i8 %11 to i1 + %conv22 = zext i1 %loadedv to i32 + %cmp23 = icmp eq i32 %conv22, 1 + %AnyAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 4 + %12 = load i64, ptr 
%WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp24.ascast, i64 %12) + %call25 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AnyAcc, ptr byval(%"range") %agg.tmp24) + %storedv26 = zext i1 %cmp23 to i8 + store i8 %storedv26, ptr addrspace(4) %call25, align 1 + %13 = load i32, ptr %LID, align 4 + %conv28 = zext i32 %13 to i64 + %14 = load i64, ptr %TangleSize.addr + %cmp29 = icmp ult i64 %conv28, %14 + %call30 = call spir_func zeroext i1 @Foo67(ptr byval(%"tangle_group") %agg.tmp27, i1 zeroext %cmp29) + %storedv31 = zext i1 %call30 to i8 + store i8 %storedv31, ptr %AllResult, align 1 + %15 = load i8, ptr %AllResult, align 1 + %loadedv32 = trunc i8 %15 to i1 + %conv33 = zext i1 %loadedv32 to i32 + %cmp34 = icmp eq i32 %conv33, 1 + %AllAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 5 + %16 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp35.ascast, i64 %16) + %call36 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %AllAcc, ptr byval(%"range") %agg.tmp35) + %storedv37 = zext i1 %cmp34 to i8 + store i8 %storedv37, ptr addrspace(4) %call36, align 1 + %17 = load i32, ptr %LID, align 4 + %conv39 = zext i32 %17 to i64 + %18 = load i64, ptr %TangleSize.addr + %cmp40 = icmp uge i64 %conv39, %18 + %call41 = call spir_func zeroext i1 @Foo65(ptr byval(%"tangle_group") %agg.tmp38, i1 zeroext %cmp40) + %storedv42 = zext i1 %call41 to i8 + store i8 %storedv42, ptr %NoneResult, align 1 + %19 = load i8, ptr %NoneResult, align 1 + %loadedv43 = trunc i8 %19 to i1 + %conv44 = zext i1 %loadedv43 to i32 + %cmp45 = icmp eq i32 %conv44, 1 + %NoneAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 6 + %20 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp46.ascast, i64 %20) + %call47 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %NoneAcc, ptr byval(%"range") %agg.tmp46) + %storedv48 = zext i1 %cmp45 to i8 + store i8 %storedv48, ptr addrspace(4) %call47, align 1 + %call51 = call spir_func i32 @Foo64(ptr byval(%"tangle_group") %agg.tmp49, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp50) + store i32 %call51, ptr %ReduceResult, align 4 + %21 = load i32, ptr %ReduceResult, align 4 + %conv52 = zext i32 %21 to i64 + %22 = load i64, ptr %TangleSize.addr + %cmp53 = icmp eq i64 %conv52, %22 + %ReduceAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 7 + %23 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp54.ascast, i64 %23) + %call55 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ReduceAcc, ptr byval(%"range") %agg.tmp54) + %storedv56 = zext i1 %cmp53 to i8 + store i8 %storedv56, ptr addrspace(4) %call55, align 1 + %call59 = call spir_func i32 @Foo63(ptr byval(%"tangle_group") %agg.tmp57, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp58) + store i32 %call59, ptr %ExScanResult, align 4 + %24 = load i32, ptr %ExScanResult, align 4 + %25 = load i32, ptr %LID, align 4 + %cmp60 = icmp eq i32 %24, %25 + %ExScanAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 8 + %26 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp61.ascast, i64 %26) + %call62 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ExScanAcc, ptr byval(%"range") %agg.tmp61) + %storedv63 = zext i1 %cmp60 to i8 + store i8 %storedv63, ptr addrspace(4) %call62, align 1 + %call66 = call spir_func i32 @Foo62(ptr 
byval(%"tangle_group") %agg.tmp64, i32 1, ptr byval(%"nd_item") align 1 %agg.tmp65) + store i32 %call66, ptr %IncScanResult, align 4 + %27 = load i32, ptr %IncScanResult, align 4 + %28 = load i32, ptr %LID, align 4 + %add67 = add i32 %28, 1 + %cmp68 = icmp eq i32 %27, %add67 + %IncScanAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 9 + %29 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp69.ascast, i64 %29) + %call70 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %IncScanAcc, ptr byval(%"range") %agg.tmp69) + %storedv71 = zext i1 %cmp68 to i8 + store i8 %storedv71, ptr addrspace(4) %call70, align 1 + %30 = load i32, ptr %LID, align 4 + %call73 = call spir_func i32 @Foo73(ptr byval(%"tangle_group") %agg.tmp72, i32 %30, i32 2) + store i32 %call73, ptr %ShiftLeftResult, align 4 + %31 = load i32, ptr %LID, align 4 + %add74 = add i32 %31, 2 + %conv75 = zext i32 %add74 to i64 + %32 = load i64, ptr %TangleSize.addr + %cmp76 = icmp uge i64 %conv75, %32 + br i1 %cmp76, label %lor.end, label %lor.rhs + +lor.rhs: ; preds = %for.cond.cleanup + %33 = load i32, ptr %ShiftLeftResult, align 4 + %34 = load i32, ptr %LID, align 4 + %add77 = add i32 %34, 2 + %cmp78 = icmp eq i32 %33, %add77 + br label %lor.end + +lor.end: ; preds = %lor.rhs, %for.cond.cleanup + %35 = phi i1 [ true, %for.cond.cleanup ], [ %cmp78, %lor.rhs ] + %ShiftLeftAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 10 + %36 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp79.ascast, i64 %36) + %call80 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ShiftLeftAcc, ptr byval(%"range") %agg.tmp79) + %storedv81 = zext i1 %35 to i8 + store i8 %storedv81, ptr addrspace(4) %call80, align 1 + %37 = load i32, ptr %LID, align 4 + %call83 = call spir_func i32 @Foo53(ptr byval(%"tangle_group") %agg.tmp82, i32 %37, i32 2) + store i32 %call83, ptr %ShiftRightResult, align 4 + %38 = load i32, ptr %LID, align 4 + %cmp84 = icmp ult i32 %38, 2 + br i1 %cmp84, label %lor.end87, label %lor.rhs85 + +lor.rhs85: ; preds = %lor.end + %39 = load i32, ptr %ShiftRightResult, align 4 + %40 = load i32, ptr %LID, align 4 + %sub = sub i32 %40, 2 + %cmp86 = icmp eq i32 %39, %sub + br label %lor.end87 + +lor.end87: ; preds = %lor.rhs85, %lor.end + %41 = phi i1 [ true, %lor.end ], [ %cmp86, %lor.rhs85 ] + %ShiftRightAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 11 + %42 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp88.ascast, i64 %42) + %call89 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %ShiftRightAcc, ptr byval(%"range") %agg.tmp88) + %storedv90 = zext i1 %41 to i8 + store i8 %storedv90, ptr addrspace(4) %call89, align 1 + %43 = load i32, ptr %LID, align 4 + call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp93.ascast, ptr addrspace(4) %Tangle.ascast) + store i32 2, ptr %ref.tmp94, align 4 + call spir_func void @Foo55(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %ref.tmp93.ascast, ptr addrspace(4) align 4 %ref.tmp94.ascast) + call spir_func void @Foo56(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.tmp92.ascast, ptr addrspace(4) %ref.tmp.ascast, ptr addrspace(4) %TangleSize.addr.ascast) + %call95 = call spir_func i32 @Foo57(ptr byval(%"tangle_group") %agg.tmp91, i32 %43, ptr byval(%"range") %agg.tmp92) + store i32 
%call95, ptr %SelectResult, align 4 + %44 = load i32, ptr %SelectResult, align 4 + %conv96 = zext i32 %44 to i64 + %45 = load i32, ptr %LID, align 4 + %add97 = add i32 %45, 2 + %conv98 = zext i32 %add97 to i64 + %46 = load i64, ptr %TangleSize.addr + %rem = urem i64 %conv98, %46 + %cmp99 = icmp eq i64 %conv96, %rem + %SelectAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 12 + %47 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp100.ascast, i64 %47) + %call101 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %SelectAcc, ptr byval(%"range") %agg.tmp100) + %storedv102 = zext i1 %cmp99 to i8 + store i8 %storedv102, ptr addrspace(4) %call101, align 1 + %48 = load i32, ptr %LID, align 4 + %call104 = call spir_func i32 @Foo58(ptr byval(%"tangle_group") %agg.tmp103, i32 %48, i32 2) + store i32 %call104, ptr %PermuteXorResult, align 4 + %49 = load i32, ptr %PermuteXorResult, align 4 + %50 = load i32, ptr %LID, align 4 + %xor = xor i32 %50, 2 + %cmp105 = icmp eq i32 %49, %xor + %PermuteXorAcc = getelementptr inbounds nuw %class.anon.8, ptr addrspace(4) %this1, i32 0, i32 13 + %51 = load i64, ptr %WI.addr + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp106.ascast, i64 %51) + %call107 = call spir_func align 1 ptr addrspace(4) @Foo54(ptr addrspace(4) %PermuteXorAcc, ptr byval(%"range") %agg.tmp106) + %storedv108 = zext i1 %cmp105 to i8 + store i8 %storedv108, ptr addrspace(4) %call107, align 1 + ret void + +for.body: ; preds = %for.cond + %52 = load i64, ptr %Other + %call3 = call spir_func zeroext i1 @Bar14(ptr addrspace(4) align 1 %IsMember.ascast, i64 %52) + br i1 %call3, label %if.then, label %if.end + +if.then: ; preds = %for.body + %TmpAcc42 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %53 = load i64, ptr %Other + call spir_func void @Foo46(ptr addrspace(4) %agg.tmp5.ascast, i64 %53) + %call6 = call spir_func ptr addrspace(4) @Foo70(ptr addrspace(4) %TmpAcc42, ptr byval(%"range") %agg.tmp5) + %54 = load i64, ptr addrspace(4) %call6 + %55 = load i64, ptr %Visible + %add = add i64 %55, %54 + store i64 %add, ptr %Visible + br label %if.end + +if.end: ; preds = %if.then, %for.body + %56 = load i64, ptr %Other + %inc = add i64 %56, 1 + store i64 %inc, ptr %Other + br label %for.cond +} + + +define internal spir_func zeroext i1 @Bar14(ptr addrspace(4) align 1 %this, i64 %Other) { +entry: + %retval = alloca i1, align 1 + %this.addr = alloca ptr addrspace(4) + %Other.addr = alloca i64 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + store i64 %Other, ptr %Other.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %0 = load i64, ptr %Other.addr + %cmp = icmp uge i64 %0, 24 + br i1 %cmp, label %land.rhs, label %land.end + +land.rhs: ; preds = %entry + %1 = load i64, ptr %Other.addr + %cmp2 = icmp ult i64 %1, 32 + br label %land.end + +land.end: ; preds = %land.rhs, %entry + %2 = phi i1 [ false, %entry ], [ %cmp2, %land.rhs ] + ret i1 %2 +} + + +define internal spir_func zeroext i1 @Foo74(ptr addrspace(4) align 1 %this, i64 %Other) { +entry: + %retval = alloca i1, align 1 + %this.addr = alloca ptr addrspace(4) + %Other.addr = alloca i64 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + store i64 %Other, ptr %Other.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %0 = load i64, ptr %Other.addr + %cmp = icmp uge i64 %0, 4 + br i1 %cmp, label %land.rhs, label %land.end + 
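+ ; @Foo74 returns 4 <= %Other && %Other < 24, computed via the short-circuit blocks below.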
+land.rhs: ; preds = %entry + %1 = load i64, ptr %Other.addr + %cmp2 = icmp ult i64 %1, 24 + br label %land.end + +land.end: ; preds = %land.rhs, %entry + %2 = phi i1 [ false, %entry ], [ %cmp2, %land.rhs ] + ret i1 %2 +} + + +define internal spir_func i32 @Bar13(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"range") %mask) { +entry: + %retval = alloca i32, align 4 + %x.addr = alloca i32, align 4 + %TargetLocalId = alloca %"range" + %ref.tmp = alloca %"range" + %TargetId = alloca i32, align 4 + %agg.tmp = alloca %"tangle_group" + %agg.tmp1 = alloca %"range" + %cleanup.dest.slot = alloca i32, align 4 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + %x.addr.ascast = addrspacecast ptr %x.addr to ptr addrspace(4) + %TargetLocalId.ascast = addrspacecast ptr %TargetLocalId to ptr addrspace(4) + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + %g.ascast = addrspacecast ptr %g to ptr addrspace(4) + store i32 %x, ptr %x.addr, align 4 + %mask.ascast = addrspacecast ptr %mask to ptr addrspace(4) + %0 = addrspacecast ptr addrspace(1) @_ZSt6ignore to ptr addrspace(4) + %call = call spir_func align 1 ptr addrspace(4) @Bar15(ptr addrspace(4) align 1 %0, ptr addrspace(4) %g.ascast) + call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %g.ascast) + call spir_func void @Bar16(ptr addrspace(4) dead_on_unwind writable sret(%"range") %TargetLocalId.ascast, ptr addrspace(4) %ref.tmp.ascast, ptr addrspace(4) %mask.ascast) + %call2 = call spir_func i32 @Foo48(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"range") %agg.tmp1) + store i32 %call2, ptr %TargetId, align 4 + %call3 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %x.addr.ascast) + %1 = load i32, ptr %TargetId, align 4 + %call4 = call spir_func i32 @Foo50(i32 3, i32 %call3, i32 %1) + ret i32 %call4 +} + + +define internal spir_func align 1 ptr addrspace(4) @Bar15(ptr addrspace(4) align 1 %this, ptr addrspace(4) %0) { +entry: + %retval = alloca ptr addrspace(4) + %this.addr = alloca ptr addrspace(4) + %.addr = alloca ptr addrspace(4) + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + store ptr addrspace(4) %0, ptr %.addr + %this1 = load ptr addrspace(4), ptr %this.addr + ret ptr addrspace(4) %this1 +} + + +define internal spir_func void @Bar16(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %lhs, ptr addrspace(4) %rhs) { +entry: + %lhs.addr = alloca ptr addrspace(4) + %rhs.addr = alloca ptr addrspace(4) + %i = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + store ptr addrspace(4) %lhs, ptr %lhs.addr + store ptr addrspace(4) %rhs, ptr %rhs.addr + call spir_func void @Foo11(ptr addrspace(4) %agg.result) + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 1 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + ret void + +for.body: ; preds = %for.cond + %1 = load ptr addrspace(4), ptr %lhs.addr + %common_array2 = bitcast ptr addrspace(4) %1 to ptr addrspace(4) + %2 = load i32, ptr %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array2, i64 0, i64 %idxprom + %3 = load i64, ptr addrspace(4) %arrayidx + %4 = load ptr addrspace(4), ptr %rhs.addr + %common_array13 = bitcast ptr addrspace(4) %4 to ptr addrspace(4) + %5 = 
load i32, ptr %i, align 4 + %idxprom2 = sext i32 %5 to i64 + %arrayidx3 = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array13, i64 0, i64 %idxprom2 + %6 = load i64, ptr addrspace(4) %arrayidx3 + %xor = xor i64 %3, %6 + %common_array44 = bitcast ptr addrspace(4) %agg.result to ptr addrspace(4) + %7 = load i32, ptr %i, align 4 + %idxprom5 = sext i32 %7 to i64 + %arrayidx6 = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array44, i64 0, i64 %idxprom5 + store i64 %xor, ptr addrspace(4) %arrayidx6 + %8 = load i32, ptr %i, align 4 + %inc = add nsw i32 %8, 1 + store i32 %inc, ptr %i, align 4 + br label %for.cond +} + + +define internal spir_func i32 @Foo48(ptr byval(%"tangle_group") %g, ptr byval(%"range") %local_id) { +entry: + %retval.i = alloca i64 + %this.addr.i = alloca ptr addrspace(4) + %Result.i = alloca i64 + %retval = alloca i32, align 4 + %agg.tmp = alloca %"tangle_group" + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + %local_id.ascast = addrspacecast ptr %local_id to ptr addrspace(4) + %retval.ascast.i = addrspacecast ptr %retval.i to ptr addrspace(4) + store ptr addrspace(4) %local_id.ascast, ptr %this.addr.i + %this1.i = load ptr addrspace(4), ptr %this.addr.i + %0 = load i64, ptr addrspace(4) %this1.i + store i64 %0, ptr %Result.i + %1 = load i64, ptr %Result.i + %conv = trunc i64 %1 to i32 + %call1 = call spir_func i32 @Bar17(ptr byval(%"tangle_group") %agg.tmp, i32 %conv) + ret i32 %call1 +} + + +define internal spir_func i32 @Foo49(ptr addrspace(4) align 4 %x) { +entry: + %retval = alloca i32, align 4 + %x.addr = alloca ptr addrspace(4) + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %x, ptr %x.addr + %0 = load ptr addrspace(4), ptr %x.addr + %1 = load i32, ptr addrspace(4) %0, align 4 + ret i32 %1 +} + +declare dso_local spir_func i32 @Foo50(i32, i32, i32) + + +define internal spir_func i32 @Bar17(ptr byval(%"tangle_group") %Group, i32 %Id) { +entry: + %retval = alloca i32, align 4 + %Id.addr = alloca i32, align 4 + %MemberMask = alloca %"vec.16", align 16 + %agg.tmp = alloca %"ss_sub_group_mask" + %agg.tmp1 = alloca %"tangle_group" + %Count = alloca i32, align 4 + %i = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + %b = alloca i32, align 4 + %MemberMask.ascast = addrspacecast ptr %MemberMask to ptr addrspace(4) + %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4) + store i32 %Id, ptr %Id.addr, align 4 + call spir_func void @Bar18(ptr addrspace(4) dead_on_unwind writable sret(%"ss_sub_group_mask") %agg.tmp.ascast, ptr byval(%"tangle_group") %agg.tmp1) + call spir_func void @Bar19(ptr addrspace(4) dead_on_unwind writable sret(%"vec.16") align 16 %MemberMask.ascast, ptr byval(%"ss_sub_group_mask") %agg.tmp) + store i32 0, ptr %Count, align 4 + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.end, %entry + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 4 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + store i32 2, ptr %cleanup.dest.slot, align 4 + br label %cleanup12 + +for.body: ; preds = %for.cond + store i32 0, ptr %b, align 4 + br label %for.cond2 + +for.cond2: ; preds = %if.end8, %for.body + %1 = load i32, ptr %b, align 4 + %cmp3 = icmp slt i32 %1, 32 + br i1 %cmp3, label %for.body5, label %for.cond.cleanup4 + +for.cond.cleanup4: ; preds = %for.cond2 + store i32 5, ptr %cleanup.dest.slot, align 4 + br label %cleanup + +for.body5: ; preds = %for.cond2 + %2 = load i32, ptr 
%i, align 4
+  %call = call spir_func align 4 ptr addrspace(4) @Bar20(ptr addrspace(4) align 16 %MemberMask.ascast, i32 %2)
+  %3 = load i32, ptr addrspace(4) %call, align 4
+  %4 = load i32, ptr %b, align 4
+  %shl = shl i32 1, %4
+  %and = and i32 %3, %shl
+  %tobool = icmp ne i32 %and, 0
+  br i1 %tobool, label %if.then, label %if.end8
+
+if.then: ; preds = %for.body5
+  %5 = load i32, ptr %Count, align 4
+  %6 = load i32, ptr %Id.addr, align 4
+  %cmp6 = icmp eq i32 %5, %6
+  br i1 %cmp6, label %if.then7, label %if.end
+
+if.end: ; preds = %if.then
+  %7 = load i32, ptr %Count, align 4
+  %inc = add i32 %7, 1
+  store i32 %inc, ptr %Count, align 4
+  br label %if.end8
+
+if.end8: ; preds = %if.end, %for.body5
+  %8 = load i32, ptr %b, align 4
+  %inc9 = add nsw i32 %8, 1
+  store i32 %inc9, ptr %b, align 4
+  br label %for.cond2
+
+if.then7: ; preds = %if.then
+  %9 = load i32, ptr %i, align 4
+  %mul = mul nsw i32 %9, 32
+  %10 = load i32, ptr %b, align 4
+  %add = add nsw i32 %mul, %10
+  store i32 %add, ptr %retval, align 4
+  store i32 1, ptr %cleanup.dest.slot, align 4
+  br label %cleanup
+
+cleanup: ; preds = %if.then7, %for.cond.cleanup4
+  %cleanup.dest = load i32, ptr %cleanup.dest.slot, align 4
+  %cond = icmp eq i32 %cleanup.dest, 5
+  br i1 %cond, label %for.end, label %cleanup12
+
+for.end: ; preds = %cleanup
+  %11 = load i32, ptr %i, align 4
+  %inc11 = add nsw i32 %11, 1
+  store i32 %inc11, ptr %i, align 4
+  br label %for.cond
+
+cleanup12: ; preds = %cleanup, %for.cond.cleanup
+  %cleanup.dest13 = load i32, ptr %cleanup.dest.slot, align 4
+  %cond1 = icmp eq i32 %cleanup.dest13, 2
+  br i1 %cond1, label %for.end14, label %cleanup15
+
+for.end14: ; preds = %cleanup12
+  %12 = load i32, ptr %Count, align 4
+  store i32 %12, ptr %retval, align 4
+  store i32 1, ptr %cleanup.dest.slot, align 4
+  br label %cleanup15
+
+cleanup15: ; preds = %cleanup12, %for.end14
+  %13 = load i32, ptr %retval, align 4
+  ret i32 %13
+}
+
+
+define internal spir_func void @Bar18(ptr addrspace(4) dead_on_unwind noalias writable sret(%"ss_sub_group_mask") %agg.result, ptr byval(%"tangle_group") %Group) {
+entry:
+  %Mask1 = bitcast ptr %Group to ptr
+  ret void
+}
+
+
+define internal spir_func void @Bar19(ptr addrspace(4) dead_on_unwind noalias writable sret(%"vec.16") align 16 %agg.result, ptr byval(%"ss_sub_group_mask") %Mask) {
+entry:
+  %TmpMArray = alloca %"struct.std::array.20", align 4
+  %agg.tmp = alloca %"range"
+  %i = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %cleanup.dest.slot2 = alloca i32, align 4
+  %TmpMArray.ascast = addrspacecast ptr %TmpMArray to ptr addrspace(4)
+  %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4)
+  %Mask.ascast = addrspacecast ptr %Mask to ptr addrspace(4)
+  call spir_func void @Bar50(ptr addrspace(4) align 4 %TmpMArray.ascast)
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp.ascast, i64 0)
+  call spir_func void @Bar51(ptr addrspace(4) %Mask.ascast, ptr addrspace(4) align 4 %TmpMArray.ascast, ptr byval(%"range") %agg.tmp)
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %cmp = icmp slt i32 %0, 4
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+  br label %for.end
+
+for.end: ; preds = %for.cond.cleanup
+  ret void
+
+for.body: ; preds = %for.cond
+  %1 = load i32, ptr %i, align 4
+  %conv = sext i32 %1 to i64
+  %call = call spir_func align 4 ptr addrspace(4) @Bar57(ptr addrspace(4) align 4 %TmpMArray.ascast, i64 %conv)
+  %2 = load i32, ptr addrspace(4) %call, align 4
+  %3 = load i32, ptr %i, align 4
+  %call1 = call spir_func align 4 ptr addrspace(4) @Bar20(ptr addrspace(4) align 16 %agg.result, i32 %3)
+  store i32 %2, ptr addrspace(4) %call1, align 4
+  br label %for.inc
+
+for.inc: ; preds = %for.body
+  %4 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %4, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+}
+
+
+define internal spir_func align 4 ptr addrspace(4) @Bar20(ptr addrspace(4) align 16 %this, i32 %i) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %i.addr = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i32 %i, ptr %i.addr, align 4
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %m_Data1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i32, ptr %i.addr, align 4
+  %conv = sext i32 %0 to i64
+  %call = call spir_func align 4 ptr addrspace(4) @_ZNSt5arrayIjLm4EEixEm(ptr addrspace(4) align 4 %m_Data1, i64 %conv)
+  ret ptr addrspace(4) %call
+}
+
+
+define internal spir_func align 4 ptr addrspace(4) @_ZNSt5arrayIjLm4EEixEm(ptr addrspace(4) align 4 %this, i64 %__n) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %__n.addr = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %__n, ptr %__n.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %_M_elems1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr %__n.addr
+  %call = call spir_func align 4 ptr addrspace(4) @_ZNSt14__array_traitsIjLm4EE6_S_refERA4_Kjm(ptr addrspace(4) align 4 %_M_elems1, i64 %0)
+  ret ptr addrspace(4) %call
+}
+
+
+define internal spir_func align 4 ptr addrspace(4) @_ZNSt14__array_traitsIjLm4EE6_S_refERA4_Kjm(ptr addrspace(4) align 4 %__t, i64 %__n) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %__t.addr = alloca ptr addrspace(4)
+  %__n.addr = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %__t, ptr %__t.addr
+  store i64 %__n, ptr %__n.addr
+  %0 = load ptr addrspace(4), ptr %__t.addr
+  %1 = load i64, ptr %__n.addr
+  %arrayidx = getelementptr inbounds nuw [4 x i32], ptr addrspace(4) %0, i64 0, i64 %1
+  ret ptr addrspace(4) %arrayidx
+}
+
+
+define internal spir_func void @Bar50(ptr addrspace(4) align 4 %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = inttoptr i64 16 to ptr addrspace(4)
+  br label %arrayinit.body
+
+arrayinit.body: ; preds = %arrayinit.body, %entry
+  %lsr.iv = phi i64 [ %lsr.iv.next, %arrayinit.body ], [ 0, %entry ]
+  %scevgep = getelementptr i8, ptr addrspace(4) %this1, i64 %lsr.iv
+  store i32 0, ptr addrspace(4) %scevgep, align 4
+  %lsr.iv.next = add nuw nsw i64 %lsr.iv, 4
+  %lsr.iv.next1 = inttoptr i64 %lsr.iv.next to ptr addrspace(4)
+  %arrayinit.done = icmp eq ptr addrspace(4) %lsr.iv.next1, %0
+  br i1 %arrayinit.done, label %arrayinit.end2, label %arrayinit.body
+
+arrayinit.end2: ; preds = %arrayinit.body
+  ret void
+}
+
+
+define internal spir_func void @Bar51(ptr addrspace(4) %this, ptr addrspace(4) align 4 %bits, ptr byval(%"range") %pos) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %bits.addr = alloca ptr addrspace(4)
+  %cur_pos = alloca i64
+  %__range4 = alloca ptr addrspace(4)
+  %__begin0 = alloca ptr addrspace(4)
+  %__end0 = alloca ptr addrspace(4)
+  %cleanup.dest.slot = alloca i32, align 4
+  %elem = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"range"
+  %agg.tmp.ascast = addrspacecast ptr %agg.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %bits, ptr %bits.addr
+  %pos.ascast = addrspacecast ptr %pos to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %call = call spir_func i64 @Bar52(ptr addrspace(4) %pos.ascast, i32 0)
+  store i64 %call, ptr %cur_pos
+  %0 = load ptr addrspace(4), ptr %bits.addr
+  store ptr addrspace(4) %0, ptr %__range4
+  %1 = load ptr addrspace(4), ptr %__range4
+  %call2 = call spir_func ptr addrspace(4) @Bar53(ptr addrspace(4) align 4 %1)
+  store ptr addrspace(4) %call2, ptr %__begin0
+  %2 = load ptr addrspace(4), ptr %__range4
+  %call3 = call spir_func ptr addrspace(4) @Bar54(ptr addrspace(4) align 4 %2)
+  store ptr addrspace(4) %call3, ptr %__end0
+  br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+  %3 = load ptr addrspace(4), ptr %__begin0
+  %4 = load ptr addrspace(4), ptr %__end0
+  %cmp = icmp ne ptr addrspace(4) %3, %4
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+  br label %for.end
+
+for.end: ; preds = %for.cond.cleanup
+  ret void
+
+for.body: ; preds = %for.cond
+  %5 = load ptr addrspace(4), ptr %__begin0
+  store ptr addrspace(4) %5, ptr %elem
+  %6 = load i64, ptr %cur_pos
+  %call4 = call spir_func i32 @Bar55(ptr addrspace(4) %this1)
+  %conv = zext i32 %call4 to i64
+  %cmp5 = icmp ult i64 %6, %conv
+  br i1 %cmp5, label %if.then, label %if.else
+
+if.else: ; preds = %for.body
+  %7 = load ptr addrspace(4), ptr %elem
+  store i32 0, ptr addrspace(4) %7, align 4
+  br label %if.end
+
+if.then: ; preds = %for.body
+  %8 = load ptr addrspace(4), ptr %elem
+  %9 = load i64, ptr %cur_pos
+  call spir_func void @Foo46(ptr addrspace(4) %agg.tmp.ascast, i64 %9)
+  call spir_func void @Bar56(ptr addrspace(4) %this1, ptr addrspace(4) align 4 %8, ptr byval(%"range") %agg.tmp)
+  %10 = load i64, ptr %cur_pos
+  %add = add i64 %10, 32
+  store i64 %add, ptr %cur_pos
+  br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+  br label %for.inc
+
+for.inc: ; preds = %if.end
+  %11 = load ptr addrspace(4), ptr %__begin0
+  %incdec.ptr = getelementptr inbounds nuw i32, ptr addrspace(4) %11, i32 1
+  store ptr addrspace(4) %incdec.ptr, ptr %__begin0
+  br label %for.cond
+}
+
+
+define internal spir_func align 4 ptr addrspace(4) @Bar57(ptr addrspace(4) align 4 %this, i64 %index) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %index.addr = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %index, ptr %index.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %MData1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr %index.addr
+  %arrayidx = getelementptr inbounds nuw [4 x i32], ptr addrspace(4) %MData1, i64 0, i64 %0
+  ret ptr addrspace(4) %arrayidx
+}
+
+
+define internal spir_func i64 @Bar52(ptr addrspace(4) %this, i32 %dimension) {
+entry:
+  %this.addr.i = alloca ptr addrspace(4)
+  %dimension.addr.i = alloca i32, align 4
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %dimension.addr = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i32 %dimension, ptr %dimension.addr, align 4
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i32, ptr %dimension.addr, align 4
+  store ptr addrspace(4) %this1, ptr %this.addr.i
+  store i32 %0, ptr %dimension.addr.i, align 4
+  %this1.i = load ptr addrspace(4), ptr %this.addr.i
+  %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load i32, ptr %dimension.addr, align 4
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array1, i64 0, i64 %idxprom
+  %2 = load i64, ptr addrspace(4) %arrayidx
+  ret i64 %2
+}
+
+
+define internal spir_func ptr addrspace(4) @Bar53(ptr addrspace(4) align 4 %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+
+
+  %this.addr1 = bitcast ptr %this.addr to ptr
+  %this.addr2 = bitcast ptr %this.addr1 to ptr
+  %this1 = load ptr addrspace(4), ptr %this.addr2
+
+
+
+; %this1 = load ptr addrspace(4), ptr %this.addr
+  %MData1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %arraydecay2 = bitcast ptr addrspace(4) %MData1 to ptr addrspace(4)
+  ret ptr addrspace(4) %arraydecay2
+}
+
+
+define internal spir_func ptr addrspace(4) @Bar54(ptr addrspace(4) align 4 %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+
+
+  %this.addr1 = bitcast ptr %this.addr to ptr
+  %this.addr2 = bitcast ptr %this.addr1 to ptr
+  %this1 = load ptr addrspace(4), ptr %this.addr2
+
+; %this1 = load ptr addrspace(4), ptr %this.addr
+  %MData1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %arraydecay2 = bitcast ptr addrspace(4) %MData1 to ptr addrspace(4)
+  %add.ptr = getelementptr inbounds nuw i32, ptr addrspace(4) %arraydecay2, i64 4
+  ret ptr addrspace(4) %add.ptr
+}
+
+
+define internal spir_func i32 @Bar55(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca i32, align 4
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %bits_num = getelementptr inbounds nuw %"ss_sub_group_mask", ptr addrspace(4) %this1, i32 0, i32 1
+  %0 = load i64, ptr addrspace(4) %bits_num
+  %conv = trunc i64 %0 to i32
+  ret i32 %conv
+}
+
+
+define internal spir_func void @Bar56(ptr addrspace(4) %this, ptr addrspace(4) align 4 %bits, ptr byval(%"range") %pos) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %bits.addr = alloca ptr addrspace(4)
+  %Res = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %bits, ptr %bits.addr
+  %pos.ascast = addrspacecast ptr %pos to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %Bits1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr addrspace(4) %Bits1
+  store i64 %0, ptr %Res
+  %bits_num = getelementptr inbounds nuw %"ss_sub_group_mask", ptr addrspace(4) %this1, i32 0, i32 1
+  %1 = load i64, ptr addrspace(4) %bits_num
+  %call = call spir_func i64 @Bar58(ptr addrspace(4) %this1, i64 %1)
+  %2 = load i64, ptr %Res
+  %and = and i64 %2, %call
+  store i64 %and, ptr %Res
+  %call2 = call spir_func i64 @Bar52(ptr addrspace(4) %pos.ascast, i32 0)
+  %call3 = call spir_func i32 @Bar55(ptr addrspace(4) %this1)
+  %conv = zext i32 %call3 to i64
+  %cmp = icmp ult i64 %call2, %conv
+  br i1 %cmp, label %if.then, label %if.else
+
+if.else: ; preds = %entry
+  %3 = load ptr addrspace(4), ptr %bits.addr
+  store i32 0, ptr addrspace(4) %3, align 4
+  br label %if.end11
+
+if.then: ; preds = %entry
+  %call4 = call spir_func i64 @Bar52(ptr addrspace(4) %pos.ascast, i32 0)
+  %cmp5 = icmp ugt i64 %call4, 0
+  br i1 %cmp5, label %if.then6, label %if.end
+
+if.then6: ; preds = %if.then
+  %call7 = call spir_func i64 @Bar52(ptr addrspace(4) %pos.ascast, i32 0)
+  %4 = load i64, ptr %Res
+  %shr = lshr i64 %4, %call7
+  store i64 %shr, ptr %Res
+  br label %if.end
+
+if.end: ; preds = %if.then6, %if.then
+  %call8 = call spir_func i64 @Bar58(ptr addrspace(4) %this1, i64 32)
+  %5 = load i64, ptr %Res
+  %and9 = and i64 %5, %call8
+  store i64 %and9, ptr %Res
+  %6 = load i64, ptr %Res
+  %conv10 = trunc i64 %6 to i32
+  %7 = load ptr addrspace(4), ptr %bits.addr
+  store i32 %conv10, ptr addrspace(4) %7, align 4
+  br label %if.end11
+
+if.end11: ; preds = %if.else, %if.end
+  ret void
+}
+
+
+define internal spir_func i64 @Bar58(ptr addrspace(4) %this, i64 %bn) {
+entry:
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %bn.addr = alloca i64
+  %one = alloca i64
+  %cleanup.dest.slot = alloca i32, align 4
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %bn, ptr %bn.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i64, ptr %bn.addr
+  %cmp = icmp ule i64 %0, 64
+  %1 = addrspacecast ptr addrspace(1) @.str.2 to ptr addrspace(4)
+  %2 = addrspacecast ptr addrspace(1) @.str.1 to ptr addrspace(4)
+  %3 = addrspacecast ptr addrspace(1) @__PRETTY_FUNCTION2 to ptr addrspace(4)
+  br i1 %cmp, label %cond.end, label %cond.false
+
+cond.false: ; preds = %entry
+  call spir_func void @__assert_fail(ptr addrspace(4) %1, ptr addrspace(4) %2, i32 327, ptr addrspace(4) %3)
+  br label %cond.end
+
+cond.end: ; preds = %entry, %cond.false
+  store i64 1, ptr %one
+  %4 = load i64, ptr %bn.addr
+  %cmp2 = icmp eq i64 %4, 64
+  br i1 %cmp2, label %if.then, label %if.end
+
+if.end: ; preds = %cond.end
+  %5 = load i64, ptr %one
+  %6 = load i64, ptr %bn.addr
+  %shl = shl i64 %5, %6
+  %7 = load i64, ptr %one
+  %sub3 = sub i64 %shl, %7
+  store i64 %sub3, ptr %retval
+  store i32 1, ptr %cleanup.dest.slot, align 4
+  br label %cleanup
+
+if.then: ; preds = %cond.end
+  %8 = load i64, ptr %one
+  %sub = sub i64 0, %8
+  store i64 %sub, ptr %retval
+  store i32 1, ptr %cleanup.dest.slot, align 4
+  br label %cleanup
+
+cleanup: ; preds = %if.end, %if.then
+  %9 = load i64, ptr %retval
+  ret i64 %9
+}
+
+
+
+
+define internal spir_func void @Foo11(ptr addrspace(4) %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  call spir_func void @Foo60(ptr addrspace(4) %this1, i64 0)
+  ret void
+}
+
+
+define internal spir_func void @Foo60(ptr addrspace(4) %this, i64 %dim0) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %dim0.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %dim0, ptr %dim0.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr %dim0.addr
+  store i64 %0, ptr addrspace(4) %common_array1
+  ret void
+}
+
+
+define internal spir_func i32 @Foo59(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"range") %local_id) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %LocalId = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"range"
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %x.addr.ascast = addrspacecast ptr %x.addr to ptr addrspace(4)
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %call = call spir_func i32 @Foo48(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"range") %agg.tmp1)
+  store i32 %call, ptr %LocalId, align 4
+  %0 = addrspacecast ptr addrspace(1) @_ZSt6ignore to ptr addrspace(4)
+  %call2 = call spir_func align 1 ptr addrspace(4) @Bar15(ptr addrspace(4) align 1 %0, ptr addrspace(4) %g.ascast)
+  %call3 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %x.addr.ascast)
+  %1 = load i32, ptr %LocalId, align 4
+  %call4 = call spir_func i32 @Foo50(i32 3, i32 %call3, i32 %1)
+  ret i32 %call4
+}
+
+
+define internal spir_func i32 @Foo47(ptr byval(%"ss_sub_group_mask") %Mask) {
+entry:
+  %retval = alloca i32, align 4
+  %MemberMask = alloca %"vec.16", align 16
+  %agg.tmp = alloca %"ss_sub_group_mask"
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %MemberMask.ascast = addrspacecast ptr %MemberMask to ptr addrspace(4)
+  call spir_func void @Bar19(ptr addrspace(4) dead_on_unwind writable sret(%"vec.16") align 16 %MemberMask.ascast, ptr byval(%"ss_sub_group_mask") %agg.tmp)
+  %call = call spir_func <4 x i32> @Bar59(ptr addrspace(4) align 16 %MemberMask.ascast)
+  %call1 = call spir_func i32 @_Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(i32 3, i32 2, <4 x i32> %call)
+  ret i32 %call1
+}
+
+
+define internal spir_func <4 x i32> @Bar59(ptr addrspace(4) align 16 %x) {
+entry:
+  %retval = alloca <4 x i32>, align 16
+  %x.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %x, ptr %x.addr
+  %0 = load ptr addrspace(4), ptr %x.addr
+  %call = call spir_func <4 x i32> @Bar60(ptr addrspace(4) align 16 %0)
+  ret <4 x i32> %call
+}
+
+declare dso_local spir_func i32 @_Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(i32, i32, <4 x i32>)
+
+
+define internal spir_func <4 x i32> @Bar60(ptr addrspace(4) align 16 %from) {
+entry:
+  %retval = alloca <4 x i32>, align 16
+  %from.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %from, ptr %from.addr
+  %0 = load ptr addrspace(4), ptr %from.addr
+  %1 = load <4 x i32>, ptr addrspace(4) %0, align 16
+  ret <4 x i32> %1
+}
+
+
+define internal spir_func i32 @Foo52(ptr byval(%"tangle_group") %g, i32 %x, i32 %delta) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %delta.addr = alloca i32, align 4
+  %TargetLocalId = alloca %"range"
+  %TargetId = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp3 = alloca %"range"
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %x.addr.ascast = addrspacecast ptr %x.addr to ptr addrspace(4)
+  %TargetLocalId.ascast = addrspacecast ptr %TargetLocalId to ptr addrspace(4)
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  store i32 %delta, ptr %delta.addr, align 4
+  call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %TargetLocalId.ascast, ptr addrspace(4) %g.ascast)
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %TargetLocalId.ascast, i32 0)
+  %0 = load i64, ptr addrspace(4) %call
+  %1 = load i32, ptr %delta.addr, align 4
+  %conv = zext i32 %1 to i64
+  %cmp = icmp uge i64 %0, %conv
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+  %2 = load i32, ptr %delta.addr, align 4
+  %conv1 = zext i32 %2 to i64
+  %call2 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %TargetLocalId.ascast, i32 0)
+  %3 = load i64, ptr addrspace(4) %call2
+  %sub = sub i64 %3, %conv1
+  store i64 %sub, ptr addrspace(4) %call2
+  br label %if.end
+
+if.end: ; preds = %if.then, %entry
+  %call4 = call spir_func i32 @Foo48(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"range") %agg.tmp3)
+  store i32 %call4, ptr %TargetId, align 4
+  %call5 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %x.addr.ascast)
+  %4 = load i32, ptr %TargetId, align 4
+  %call6 = call spir_func i32 @Foo50(i32 3, i32 %call5, i32 %4)
+  ret i32 %call6
+}
+
+
+define internal spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %this, i32 %dimension) {
+entry:
+  %this.addr.i = alloca ptr addrspace(4)
+  %dimension.addr.i = alloca i32, align 4
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %dimension.addr = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i32 %dimension, ptr %dimension.addr, align 4
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i32, ptr %dimension.addr, align 4
+  store ptr addrspace(4) %this1, ptr %this.addr.i
+  store i32 %0, ptr %dimension.addr.i, align 4
+  %this1.i = load ptr addrspace(4), ptr %this.addr.i
+  %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load i32, ptr %dimension.addr, align 4
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array1, i64 0, i64 %idxprom
+  ret ptr addrspace(4) %arrayidx
+}
+
+
+define internal spir_func i32 @Foo72(ptr byval(%"tangle_group") %g, i32 %x, i32 %delta) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %delta.addr = alloca i32, align 4
+  %TargetLocalId = alloca %"range"
+  %TargetId = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp6 = alloca %"range"
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %x.addr.ascast = addrspacecast ptr %x.addr to ptr addrspace(4)
+  %TargetLocalId.ascast = addrspacecast ptr %TargetLocalId to ptr addrspace(4)
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  store i32 %delta, ptr %delta.addr, align 4
+  call spir_func void @Foo51(ptr addrspace(4) dead_on_unwind writable sret(%"range") %TargetLocalId.ascast, ptr addrspace(4) %g.ascast)
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %TargetLocalId.ascast, i32 0)
+  %0 = load i64, ptr addrspace(4) %call
+  %1 = load i32, ptr %delta.addr, align 4
+  %conv = zext i32 %1 to i64
+  %add = add i64 %0, %conv
+  %call1 = call spir_func i32 @Bar61(ptr addrspace(4) %g.ascast)
+  %conv2 = zext i32 %call1 to i64
+  %cmp = icmp ult i64 %add, %conv2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+  %2 = load i32, ptr %delta.addr, align 4
+  %conv3 = zext i32 %2 to i64
+  %call4 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %TargetLocalId.ascast, i32 0)
+  %3 = load i64, ptr addrspace(4) %call4
+  %add5 = add i64 %3, %conv3
+  store i64 %add5, ptr addrspace(4) %call4
+  br label %if.end
+
+if.end: ; preds = %if.then, %entry
+  %call7 = call spir_func i32 @Foo48(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"range") %agg.tmp6)
+  store i32 %call7, ptr %TargetId, align 4
+  %call8 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %x.addr.ascast)
+  %4 = load i32, ptr %TargetId, align 4
+  %call9 = call spir_func i32 @Foo50(i32 3, i32 %call8, i32 %4)
+  ret i32 %call9
+}
+
+
+define internal spir_func i32 @Bar61(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca i32, align 4
+  %this.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"range"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  call spir_func void @Foo97(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) %this1)
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %ref.tmp.ascast, i32 0)
+  %0 = load i64, ptr addrspace(4) %call
+  %conv = trunc i64 %0 to i32
+  ret i32 %conv
+}
+
+
+define internal spir_func void @Foo97(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %Mask1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %call = call spir_func i32 @Bar62(ptr addrspace(4) %Mask1)
+  %conv = zext i32 %call to i64
+  call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 %conv)
+  ret void
+}
+
+
+define internal spir_func i32 @Bar62(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca i32, align 4
+  %this.addr = alloca ptr addrspace(4)
+  %count = alloca i32, align 4
+  %word = alloca i64
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  store i32 0, ptr %count, align 4
+  %Bits1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr addrspace(4) %Bits1
+  %bits_num = getelementptr inbounds nuw %"ss_sub_group_mask", ptr addrspace(4) %this1, i32 0, i32 1
+  %1 = load i64, ptr addrspace(4) %bits_num
+  %call = call spir_func i64 @Bar58(ptr addrspace(4) %this1, i64 %1)
+  %and = and i64 %0, %call
+  store i64 %and, ptr %word
+  br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+  %2 = load i64, ptr %word
+  %tobool = icmp ne i64 %2, 0
+  br i1 %tobool, label %while.body, label %while.end
+
+while.end: ; preds = %while.cond
+  %3 = load i32, ptr %count, align 4
+  ret i32 %3
+
+while.body: ; preds = %while.cond
+  %4 = load i64, ptr %word
+  %sub = sub i64 %4, 1
+  %5 = load i64, ptr %word
+  %and2 = and i64 %5, %sub
+  store i64 %and2, ptr %word
+  %6 = load i32, ptr %count, align 4
+  %inc = add i32 %6, 1
+  store i32 %inc, ptr %count, align 4
+  br label %while.cond
+}
+
+
+define internal spir_func void @Foo9(ptr addrspace(4) %this, i64 %dim0) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %dim0.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %dim0, ptr %dim0.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i64, ptr %dim0.addr
+  call spir_func void @Foo60(ptr addrspace(4) %this1, i64 %0)
+  ret void
+}
+
+
+define internal spir_func i32 @Foo61(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"nd_item", align 1
+  %agg.tmp2 = alloca %"nd_item", align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar63(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %4, ptr byval(%"nd_item") align 1 %agg.tmp2)
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar63(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar64(ptr byval(%"tangle_group") %agg.tmp, i32 %4)
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar64(ptr byval(%"tangle_group") %0, i32 %x) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %Arg = alloca i32, align 4
+  %Ret = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %2 = load i32, ptr %x.addr, align 4
+  store i32 %2, ptr %Arg, align 4
+  %3 = load i32, ptr %Arg, align 4
+  %call = call spir_func i32 @_Z27__spirv_GroupNonUniformIAddIiET_N5__spv5Scope4FlagEjS0_(i32 3, i32 1, i32 %3)
+  store i32 %call, ptr %Ret, align 4
+  %4 = load i32, ptr %Ret, align 4
+  ret i32 %4
+}
+
+declare dso_local spir_func i32 @_Z27__spirv_GroupNonUniformIAddIiET_N5__spv5Scope4FlagEjS0_(i32, i32, i32)
+
+
+define internal spir_func i32 @Bar12(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"nd_item", align 1
+  %agg.tmp2 = alloca %"nd_item", align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar65(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %4, ptr byval(%"nd_item") align 1 %agg.tmp2)
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar65(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar66(ptr byval(%"tangle_group") %agg.tmp, i32 %4)
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar66(ptr byval(%"tangle_group") %0, i32 %x) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %Arg = alloca i32, align 4
+  %Ret = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %2 = load i32, ptr %x.addr, align 4
+  store i32 %2, ptr %Arg, align 4
+  %3 = load i32, ptr %Arg, align 4
+  %call = call spir_func i32 @_Z27__spirv_GroupNonUniformIAddIiET_N5__spv5Scope4FlagEjS0_(i32 3, i32 2, i32 %3)
+  store i32 %call, ptr %Ret, align 4
+  %4 = load i32, ptr %Ret, align 4
+  ret i32 %4
+}
+
+
+define internal spir_func i32 @Bar11(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"nd_item", align 1
+  %agg.tmp2 = alloca %"nd_item", align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar67(ptr byval(%"tangle_group") %agg.tmp, ptr byval(%"nd_item") align 1 %agg.tmp1, i32 %4, ptr byval(%"nd_item") align 1 %agg.tmp2)
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar67(ptr byval(%"tangle_group") %g, ptr byval(%"nd_item") align 1 %0, i32 %x, ptr byval(%"nd_item") align 1 %1){
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %2 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %3 = addrspacecast ptr %1 to ptr addrspace(4)
+  %4 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar68(ptr byval(%"tangle_group") %agg.tmp, i32 %4)
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar68(ptr byval(%"tangle_group") %0, i32 %x) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %Arg = alloca i32, align 4
+  %Ret = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %2 = load i32, ptr %x.addr, align 4
+  store i32 %2, ptr %Arg, align 4
+  %3 = load i32, ptr %Arg, align 4
+  %call = call spir_func i32 @_Z27__spirv_GroupNonUniformIAddIiET_N5__spv5Scope4FlagEjS0_(i32 3, i32 0, i32 %3)
+  store i32 %call, ptr %Ret, align 4
+  %4 = load i32, ptr %Ret, align 4
+  ret i32 %4
+}
+
+
+define internal spir_func zeroext i1 @Foo66(ptr byval(%"tangle_group") %0, i1 zeroext %pred) {
+entry:
+  %retval = alloca i1, align 1
+  %pred.addr = alloca i8, align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  %storedv = zext i1 %pred to i8
+  store i8 %storedv, ptr %pred.addr, align 1
+  %2 = load i8, ptr %pred.addr, align 1
+  %loadedv = trunc i8 %2 to i1
+  %call = call spir_func zeroext i1 @Foo99(i32 3, i1 zeroext %loadedv)
+  ret i1 %call
+}
+
+declare dso_local spir_func zeroext i1 @Foo99(i32, i1 zeroext)
+
+
+define internal spir_func zeroext i1 @Bar10(ptr byval(%"tangle_group") %0, i1 zeroext %pred) {
+entry:
+  %retval = alloca i1, align 1
+  %pred.addr = alloca i8, align 1
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  %storedv = zext i1 %pred to i8
+  store i8 %storedv, ptr %pred.addr, align 1
+  %2 = load i8, ptr %pred.addr, align 1
+  %loadedv = trunc i8 %2 to i1
+  %call = call spir_func zeroext i1 @_Z26__spirv_GroupNonUniformAnyN5__spv5Scope4FlagEb(i32 3, i1 zeroext %loadedv)
+  ret i1 %call
+}
+
+declare dso_local spir_func zeroext i1 @_Z26__spirv_GroupNonUniformAnyN5__spv5Scope4FlagEb(i32, i1 zeroext)
+
+
+define internal spir_func void @Foo98(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr byval(%"range") %0, i64 %linear_id) {
+entry:
+  %linear_id.addr = alloca i64
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store i64 %linear_id, ptr %linear_id.addr
+  %2 = load i64, ptr %linear_id.addr
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %2)
+  ret void
+}
+
+
+define internal spir_func i32 @Bar69(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"range") %local_id) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %agg.tmp1 = alloca %"range"
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+  %call = call spir_func i32 @Bar70(ptr byval(%"tangle_group") %agg.tmp, i32 %0, ptr byval(%"range") %agg.tmp1)
+  ret i32 %call
+}
+
+
+define internal spir_func i32 @Bar70(ptr byval(%"tangle_group") %g, i32 %x, ptr byval(%"range") %local_id) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %VecId = alloca %"range"
+  %OCLX = alloca i32, align 4
+  %WideOCLX = alloca i32, align 4
+  %OCLId = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %VecId.ascast = addrspacecast ptr %VecId to ptr addrspace(4)
+  %OCLX.ascast = addrspacecast ptr %OCLX to ptr addrspace(4)
+  %WideOCLX.ascast = addrspacecast ptr %WideOCLX to ptr addrspace(4)
+  %OCLId.ascast = addrspacecast ptr %OCLId to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  %local_id.ascast = addrspacecast ptr %local_id to ptr addrspace(4)
+  %0 = load i32, ptr %x.addr, align 4
+  %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %local_id.ascast, i32 0)
+  %1 = load i64, ptr addrspace(4) %call
+  %call1 = call spir_func i32 @Bar71(ptr byval(%"tangle_group") %agg.tmp, i32 %0, i64 %1)
+  ret i32 %call1
+}
+
+
+define internal spir_func i32 @Bar71(ptr byval(%"tangle_group") %g, i32 %x, i64 %local_id) {
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %local_id.addr = alloca i64
+  %LocalId = alloca i32, align 4
+  %agg.tmp = alloca %"tangle_group"
+  %GroupLocalId = alloca i32, align 4
+  %OCLX = alloca i32, align 4
+  %WideOCLX = alloca i32, align 4
+  %OCLId = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %x.addr.ascast = addrspacecast ptr %x.addr to ptr addrspace(4)
+  %GroupLocalId.ascast = addrspacecast ptr %GroupLocalId to ptr addrspace(4)
+  store i32 %x, ptr %x.addr, align 4
+  store i64 %local_id, ptr %local_id.addr
+  %0 = load i64, ptr %local_id.addr
+  %conv = trunc i64 %0 to i32
+  %call = call spir_func i32 @Bar17(ptr byval(%"tangle_group") %agg.tmp, i32 %conv)
+  store i32 %call, ptr %LocalId, align 4
+  %1 = load i32, ptr %LocalId, align 4
+  store i32 %1, ptr %GroupLocalId, align 4
+  %call1 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %x.addr.ascast)
+  store i32 %call1, ptr %OCLX, align 4
+  %2 = load i32, ptr %OCLX, align 4
+  store i32 %2, ptr %WideOCLX, align 4
+  %call2 = call spir_func i32 @Foo49(ptr addrspace(4) align 4 %GroupLocalId.ascast)
+  store i32 %call2, ptr %OCLId, align 4
+  %3 = load i32, ptr %WideOCLX, align 4
+  %4 = load i32, ptr %OCLId, align 4
+  %call3 = call spir_func i32 @_Z32__spirv_GroupNonUniformBroadcastIjjET_N5__spv5Scope4FlagES0_T0_(i32 3, i32 %3, i32 %4)
+  ret i32 %call3
+}
+
+declare dso_local spir_func i32 @_Z32__spirv_GroupNonUniformBroadcastIjjET_N5__spv5Scope4FlagES0_T0_(i32, i32, i32)
+
+
+define internal spir_func void @Foo96(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) align 1 %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %call = call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv()
+  %conv = zext i32 %call to i64
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %conv)
+  ret void
+}
+
+
+define internal spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() {
+entry:
+  %retval = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4
+  ret i32 %0
+}
+
+
+define internal spir_func i64 @Foo77(ptr addrspace(4) %this, ptr byval(%"range") %Id) {
+entry:
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %Result = alloca i64
+  %ref.tmp = alloca %class.anon.15
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %Result.ascast = addrspacecast ptr %Result to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %Id.ascast = addrspacecast ptr %Id to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  store i64 0, ptr %Result
+  %0 = bitcast ptr %ref.tmp to ptr
+  store ptr addrspace(4) %this1, ptr %0
+  %Result2 = getelementptr inbounds %class.anon.15, ptr %ref.tmp, i32 0, i32 1
+  store ptr addrspace(4) %Result.ascast, ptr %Result2
+  %Id3 = getelementptr inbounds %class.anon.15, ptr %ref.tmp, i32 0, i32 2
+  store ptr addrspace(4) %Id.ascast, ptr %Id3
+  call spir_func void @Foo79(ptr addrspace(4) %ref.tmp.ascast)
+  %1 = load i64, ptr %Result
+  ret i64 %1
+}
+
+
+define internal spir_func ptr addrspace(1) @Foo78(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(1)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) %this1, i32 0, i32 1
+  %1 = load ptr addrspace(1), ptr addrspace(4) %0
+  ret ptr addrspace(1) %1
+}
+
+
+define internal spir_func void @Foo79(ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"nd_item", align 1
+  store ptr addrspace(4) %f, ptr %f.addr
+  %0 = load ptr addrspace(4), ptr %f.addr
+  call spir_func void @Foo80(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0)
+  ret void
+}
+
+
+define internal spir_func void @Foo80(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"nd_item", align 1
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store ptr addrspace(4) %f, ptr %f.addr
+  %2 = load ptr addrspace(4), ptr %f.addr
+  %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast)
+  call spir_func void @Foo81(ptr addrspace(4) %2, i64 %call)
+  ret void
+}
+
+
+define internal spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %this) {
+entry:
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  ret i64 0
+}
+
+
+define internal spir_func void @Foo81(ptr addrspace(4) %this, i64 %I) align 2 {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %I.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %I, ptr %I.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load ptr addrspace(4), ptr addrspace(4) %0
+  %Result = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 1
+  %2 = load ptr addrspace(4), ptr addrspace(4) %Result
+  %3 = load i64, ptr addrspace(4) %2
+  %call = call spir_func ptr addrspace(4) @Bar72(ptr addrspace(4) %1)
+  %4 = load i64, ptr %I.addr
+  %conv = trunc i64 %4 to i32
+  %call2 = call spir_func i64 @Foo37(ptr addrspace(4) %call, i32 %conv)
+  %mul = mul i64 %3, %call2
+  %Id = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 2
+  %5 = load ptr addrspace(4), ptr addrspace(4) %Id
+  %6 = load i64, ptr %I.addr
+  %conv3 = trunc i64 %6 to i32
+  %call4 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %5, i32 %conv3)
+  %7 = load i64, ptr addrspace(4) %call4
+  %add = add i64 %mul, %7
+  %Result5 = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 1
+  %8 = load ptr addrspace(4), ptr addrspace(4) %Result5
+  store i64 %add, ptr addrspace(4) %8
+  ret void
+}
+
+
+define internal spir_func ptr addrspace(4) @Bar72(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2
+  ret ptr addrspace(4) %MemRange
+}
+
+
+define internal spir_func i64 @Foo37(ptr addrspace(4) %this, i32 %dimension) {
+entry:
+  %this.addr.i = alloca ptr addrspace(4)
+  %dimension.addr.i = alloca i32, align 4
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %dimension.addr = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i32 %dimension, ptr %dimension.addr, align 4
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = load i32, ptr %dimension.addr, align 4
+  store ptr addrspace(4) %this1, ptr %this.addr.i
+  store i32 %0, ptr %dimension.addr.i, align 4
+  %this1.i = load ptr addrspace(4), ptr %this.addr.i
+  %common_array1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load i32, ptr %dimension.addr, align 4
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds [1 x i64], ptr addrspace(4) %common_array1, i64 0, i64 %idxprom
+  %2 = load i64, ptr addrspace(4) %arrayidx
+  ret i64 %2
+}
+
+
+define internal spir_func void @Foo95(ptr byval(%"tangle_group") %g, i32 %FenceScope, i32 %Order) {
+entry:
+  %FenceScope.addr = alloca i32, align 4
+  %Order.addr = alloca i32, align 4
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  store i32 %FenceScope, ptr %FenceScope.addr, align 4
+  store i32 %Order, ptr %Order.addr, align 4
+  %0 = load i32, ptr %FenceScope.addr, align 4
+  %call = call spir_func i32 @Bar73(i32 %0)
+  %1 = load i32, ptr %Order.addr, align 4
+  %call1 = call spir_func i32 @Bar74(i32 %1)
+  %or = or i32 %call1, 128
+  %or2 = or i32 %or, 256
+  %or3 = or i32 %or2, 512
+  call spir_func void @_Z21__spirv_MemoryBarrierjj(i32 %call, i32 %or3)
+  ret void
+}
+
+
+define internal spir_func i32 @Bar73(i32 %Scope){
+entry:
+  %retval = alloca i32, align 4
+  %Scope.addr = alloca i32, align 4
+  store i32 %Scope, ptr %Scope.addr, align 4
+  %0 = load i32, ptr %Scope.addr, align 4
+  switch i32 %0, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+    i32 4, label %sw.bb4
+  ]
+
+sw.bb4: ; preds = %entry
+  store i32 0, ptr %retval, align 4
+  br label %return
+
+sw.bb3: ; preds = %entry
+  store i32 1, ptr %retval, align 4
+  br label %return
+
+sw.bb2: ; preds = %entry
+  store i32 2, ptr %retval, align 4
+  br label %return
+
+sw.bb1: ; preds = %entry
+  store i32 3, ptr %retval, align 4
+  br label %return
+
+sw.bb: ; preds = %entry
+  store i32 4, ptr %retval, align 4
+  br label %return
+
+return: ; preds = %sw.bb4, %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb
+  %1 = load i32, ptr %retval, align 4
+  ret i32 %1
+
+sw.epilog: ; preds = %entry
+  unreachable
+}
+
+
+define internal spir_func i32 @Bar74(i32 %Order){
+entry:
+  %retval = alloca i32, align 4
+  %Order.addr = alloca i32, align 4
+  %SpvOrder = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store i32 %Order, ptr %Order.addr, align 4
+  store i32 0, ptr %SpvOrder, align 4
+  %0 = load i32, ptr %Order.addr, align 4
+  switch i32 %0, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 2, label %sw.bb1
+    i32 1, label %sw.bb1
+    i32 3, label %sw.bb2
+    i32 4, label %sw.bb3
+    i32 5, label %sw.bb4
+  ]
+
+sw.bb4: ; preds = %entry
+  store i32 16, ptr %SpvOrder, align 4
+  br label %sw.epilog
+
+sw.bb3: ; preds = %entry
+  store i32 8, ptr %SpvOrder, align 4
+  br label %sw.epilog
+
+sw.bb2: ; preds = %entry
+  store i32 4, ptr %SpvOrder, align 4
+  br label %sw.epilog
+
+sw.bb1: ; preds = %entry, %entry
+  store i32 2, ptr %SpvOrder, align 4
+  br label %sw.epilog
+
+sw.bb: ; preds = %entry
+  store i32 0, ptr %SpvOrder, align 4
+  br label %sw.epilog
+
+sw.epilog: ; preds = %sw.bb4, %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb, %entry
+  %1 = load i32, ptr %SpvOrder, align 4
+  %or = or i32 %1, 128
+  %or5 = or i32 %or, 256
+  %or6 = or i32 %or5, 512
+  ret i32 %or6
+}
+
+declare dso_local spir_func void @_Z21__spirv_MemoryBarrierjj(i32, i32)
+
+
+define internal spir_func i64 @Foo93(ptr addrspace(4) %this, ptr byval(%"range") %Id) {
+entry:
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %Result = alloca i64
+  %ref.tmp = alloca %class.anon.15
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %Result.ascast = addrspacecast ptr %Result to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %Id.ascast = addrspacecast ptr %Id to ptr addrspace(4)
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  store i64 0, ptr %Result
+  %0 = bitcast ptr %ref.tmp to ptr
+  store ptr addrspace(4) %this1, ptr %0
+  %Result2 = getelementptr inbounds %class.anon.15, ptr %ref.tmp, i32 0, i32 1
+  store ptr addrspace(4) %Result.ascast, ptr %Result2
+  %Id3 = getelementptr inbounds %class.anon.15, ptr %ref.tmp, i32 0, i32 2
+  store ptr addrspace(4) %Id.ascast, ptr %Id3
+  call spir_func void @Bar75(ptr addrspace(4) %ref.tmp.ascast)
+  %1 = load i64, ptr %Result
+  ret i64 %1
+}
+
+
+define internal spir_func ptr addrspace(1) @Foo94(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(1)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = getelementptr inbounds nuw %"accessor", ptr addrspace(4) %this1, i32 0, i32 1
+  %1 = load ptr addrspace(1), ptr addrspace(4) %0
+  ret ptr addrspace(1) %1
+}
+
+
+define internal spir_func void @Bar75(ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"nd_item", align 1
+  store ptr addrspace(4) %f, ptr %f.addr
+  %0 = load ptr addrspace(4), ptr %f.addr
+  call spir_func void @Bar76(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0)
+  ret void
+}
+
+
+define internal spir_func void @Bar76(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"nd_item", align 1
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store ptr addrspace(4) %f, ptr %f.addr
+  %2 = load ptr addrspace(4), ptr %f.addr
+  %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast)
+  call spir_func void @Bar767(ptr addrspace(4) %2, i64 %call)
+  ret void
+}
+
+
+define internal spir_func void @Bar767(ptr addrspace(4) %this, i64 %I) align 2 {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %I.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %I, ptr %I.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load ptr addrspace(4), ptr addrspace(4) %0
+  %Result = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 1
+  %2 = load ptr addrspace(4), ptr addrspace(4) %Result
+  %3 = load i64, ptr addrspace(4) %2
+  %call = call spir_func ptr addrspace(4) @Bar78(ptr addrspace(4) %1)
+  %4 = load i64, ptr %I.addr
+  %conv = trunc i64 %4 to i32
+  %call2 = call spir_func i64 @Foo37(ptr addrspace(4) %call, i32 %conv)
+  %mul = mul i64 %3, %call2
+  %Id = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 2
+  %5 = load ptr addrspace(4), ptr addrspace(4) %Id
+  %6 = load i64, ptr %I.addr
+  %conv3 = trunc i64 %6 to i32
+  %call4 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %5, i32 %conv3)
+  %7 = load i64, ptr addrspace(4) %call4
+  %add = add i64 %mul, %7
+  %Result5 = getelementptr inbounds nuw %class.anon.15, ptr addrspace(4) %this1, i32 0, i32 1
+  %8 = load ptr addrspace(4), ptr addrspace(4) %Result5
+  store i64 %add, ptr addrspace(4) %8
+  ret void
+}
+
+
+define internal spir_func ptr addrspace(4) @Bar78(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca ptr addrspace(4)
+  %this.addr = alloca ptr addrspace(4)
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2
+  ret ptr addrspace(4) %MemRange
+}
+
+
+define internal spir_func void @Foo44(ptr addrspace(4) dead_on_unwind noalias writable sret(%"ss_sub_group_mask") %agg.result, ptr byval(%"nd_item") align 1 %g, i1 zeroext %predicate) {
+entry:
+  %predicate.addr = alloca i8, align 1
+  %res = alloca <4 x i32>, align 16
+  %val = alloca i64
+  %ref.tmp = alloca %"range"
+  %cleanup.dest.slot = alloca i32, align 4
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %g.ascast = addrspacecast ptr %g to ptr addrspace(4)
+  %storedv = zext i1 %predicate to i8
+  store i8 %storedv, ptr %predicate.addr, align 1
+  %0 = load i8, ptr %predicate.addr, align 1
+  %loadedv = trunc i8 %0 to i1
+  %call = call spir_func <4 x i32> @_Z29__spirv_GroupNonUniformBallotjb(i32 3, i1 zeroext %loadedv)
+  store <4 x i32> %call, ptr %res, align 16
+  %1 = load <4 x i32>, ptr %res, align 16
+  %vecext = extractelement <4 x i32> %1, i32 0
+  %conv = zext i32 %vecext to i64
+  store i64 %conv, ptr %val
+  %2 = load <4 x i32>, ptr %res, align 16
+  %vecext1 = extractelement <4 x i32> %2, i32 1
+  %conv2 = zext i32 %vecext1 to i64
+  %shl = shl i64 %conv2, 32
+  %3 = load i64, ptr %val
+  %or = or i64 %3, %shl
+  store i64 %or, ptr %val
+  %4 = load i64, ptr %val
+  call spir_func void @Bar79(ptr addrspace(4) dead_on_unwind writable sret(%"range") %ref.tmp.ascast, ptr addrspace(4) align 1 %g.ascast)
+  %call3 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %ref.tmp.ascast, i32 0)
+  %5 = load i64, ptr addrspace(4) %call3
+  call spir_func void @Bar80(ptr addrspace(4) dead_on_unwind writable sret(%"ss_sub_group_mask") %agg.result, i64 %4, i64 %5)
+  ret void
+}
+
+
+define internal spir_func void @Foo45(ptr addrspace(4) %this, ptr byval(%"ss_sub_group_mask") %m) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %Mask1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  ret void
+}
+
+declare dso_local spir_func <4 x i32> @_Z29__spirv_GroupNonUniformBallotjb(i32, i1 zeroext)
+
+
+define internal spir_func void @Bar79(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result, ptr addrspace(4) align 1 %this) {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %call = call spir_func i32 @_Z23__spirv_SubgroupMaxSizev()
+  %conv = zext i32 %call to i64
+  call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 %conv)
+  ret void
+}
+
+
+define internal spir_func void @Bar80(ptr addrspace(4) dead_on_unwind noalias writable sret(%"ss_sub_group_mask") %agg.result, i64 %Bits, i64 %BitsNum) {
+entry:
+  %Bits.addr = alloca i64
+  %BitsNum.addr = alloca i64
+  store i64 %Bits, ptr %Bits.addr
+  store i64 %BitsNum, ptr %BitsNum.addr
+  %0 = load i64, ptr %Bits.addr
+  %1 = load i64, ptr %BitsNum.addr
+  call spir_func void @Bar81(ptr addrspace(4) %agg.result, i64 %0, i64 %1)
+  ret void
+}
+
+
+define internal spir_func void @Bar81(ptr addrspace(4) %this, i64 %rhs, i64 %bn) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %rhs.addr = alloca i64
+  %bn.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %rhs, ptr %rhs.addr
+  store i64 %bn, ptr %bn.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %Bits1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load i64, ptr %rhs.addr
+  %1 = load i64, ptr %bn.addr
+  %call = call spir_func i64 @Bar58(ptr addrspace(4) %this1, i64 %1)
+  %and = and i64 %0, %call
+  store i64 %and, ptr addrspace(4) %Bits1
+  %bits_num = getelementptr inbounds nuw %"ss_sub_group_mask", ptr addrspace(4) %this1, i32 0, i32 1
+  %2 = load i64, ptr %bn.addr
+  store i64 %2, ptr addrspace(4) %bits_num
+  %bits_num2 = getelementptr inbounds nuw %"ss_sub_group_mask", ptr addrspace(4) %this1, i32 0, i32 1
+  %3 = load i64, ptr addrspace(4) %bits_num2
+  %cmp = icmp ule i64 %3, 64
+  %4 = addrspacecast ptr addrspace(1) @.str to ptr addrspace(4)
+  %5 = addrspacecast ptr addrspace(1) @.str.1 to ptr addrspace(4)
+  %6 = addrspacecast ptr addrspace(1) @__PRETTY_FUNCTION1 to ptr addrspace(4)
+  br i1 %cmp, label %cond.end, label %cond.false
+
+cond.false: ; preds = %entry
+  call spir_func void @__assert_fail(ptr addrspace(4) %4, ptr addrspace(4) %5, i32 324, ptr addrspace(4) %6)
+  br label %cond.end
+
+cond.end: ; preds = %entry, %cond.false
+  ret void
+}
+
+
+define internal spir_func i32 @_Z23__spirv_SubgroupMaxSizev() {
+entry:
+  %retval = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupMaxSize, align 4
+  ret i32 %0
+}
+
+
+define internal spir_func void @Init6(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @Inv1(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result)
+  ret void
+}
+
+
+define internal spir_func void @Inv1(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @Inv2()
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %call)
+  ret void
+}
+
+
+define internal spir_func i64 @Inv2() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z28__spirv_GlobalInvocationId_xv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @Foo7(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @Foo8(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result)
+  ret void
+}
+
+
+define internal spir_func void @Init1(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @Inv3(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result)
+  ret void
+}
+
+
+define internal spir_func void @Init2(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @InitSize1(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result)
+  ret void
+}
+
+
+define internal spir_func void @Init3(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @InitSize2(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result)
+  ret void
+}
+
+
+define internal spir_func void @Init4(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @InitSize3(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result)
+  ret void
+}
+
+
+define internal spir_func void @Init5(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  call spir_func void @InitSize4(ptr addrspace(4) dead_on_unwind writable sret(%"range") %agg.result)
+  ret void
+}
+
+
+define internal spir_func void @Foo23(ptr addrspace(4) dead_on_unwind noalias writable sret(%"group") %agg.result, ptr addrspace(4) %Global, ptr addrspace(4) %Local, ptr addrspace(4) %Group, ptr addrspace(4) %Index) {
+entry:
+  %Global.addr = alloca ptr addrspace(4)
+  %Local.addr = alloca ptr addrspace(4)
+  %Group.addr = alloca ptr addrspace(4)
+  %Index.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"range"
+  store ptr addrspace(4) %Global, ptr %Global.addr
+  store ptr addrspace(4) %Local, ptr %Local.addr
+  store ptr addrspace(4) %Group, ptr %Group.addr
+  store ptr addrspace(4) %Index, ptr %Index.addr
+  %0 = load ptr addrspace(4), ptr %Global.addr
+  %1 = load ptr addrspace(4), ptr %Local.addr
+  %2 = load ptr addrspace(4), ptr %Group.addr
+  %3 = load ptr addrspace(4), ptr %Index.addr
+  call spir_func void @Bar82(ptr addrspace(4) %agg.result, ptr addrspace(4) %0, ptr addrspace(4) %1, ptr byval(%"range") %agg.tmp, ptr addrspace(4) %3)
+  ret void
+}
+
+
+define internal spir_func void @Foo24(ptr addrspace(4) dead_on_unwind noalias writable sret(%"item") %agg.result, ptr addrspace(4) %Extent, ptr addrspace(4) %Index, ptr addrspace(4) %Offset) {
+entry:
+  %Extent.addr = alloca ptr addrspace(4)
+  %Index.addr = alloca ptr addrspace(4)
+  %Offset.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %Extent, ptr %Extent.addr
+  store ptr addrspace(4) %Index, ptr %Index.addr
+  store ptr addrspace(4) %Offset, ptr %Offset.addr
+  %0 = load ptr addrspace(4), ptr %Extent.addr
+  %1 = load ptr addrspace(4), ptr %Index.addr
+  %2 = load ptr addrspace(4), ptr %Offset.addr
+  call spir_func void @Foo29(ptr addrspace(4) %agg.result, ptr addrspace(4) %0, ptr addrspace(4) %1, ptr addrspace(4) %2)
+  ret void
+}
+
+
+define internal spir_func void @Foo25(ptr addrspace(4) dead_on_unwind noalias writable sret(%"item.22") %agg.result, ptr addrspace(4) %Extent, ptr addrspace(4) %Index) {
+entry:
+  %Extent.addr = alloca ptr addrspace(4)
+  %Index.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %Extent, ptr %Extent.addr
+  store ptr addrspace(4) %Index, ptr %Index.addr
+  %0 = load ptr addrspace(4), ptr %Extent.addr
+  %1 = load ptr addrspace(4), ptr %Index.addr
+  call spir_func void @Foo27(ptr addrspace(4) %agg.result, ptr addrspace(4) %0, ptr addrspace(4) %1)
+  ret void
+}
+
+
+define internal spir_func void @Foo26(ptr addrspace(4) dead_on_unwind noalias writable sret(%"nd_item") align 1 %agg.result, ptr addrspace(4) %Global, ptr addrspace(4) %Local, ptr addrspace(4) %Group) {
+entry:
+  %Global.addr = alloca ptr addrspace(4)
+  %Local.addr = alloca ptr addrspace(4)
+  %Group.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %Global, ptr %Global.addr
+  store ptr addrspace(4) %Local, ptr %Local.addr
+  store ptr addrspace(4) %Group, ptr %Group.addr
+  %0 = load ptr addrspace(4), ptr %Global.addr
+  %1 = load ptr addrspace(4), ptr %Local.addr
+  %2 = load ptr addrspace(4), ptr %Group.addr
+  call spir_func void @Foo28(ptr addrspace(4) align 1 %agg.result, ptr addrspace(4) %0, ptr addrspace(4) %1, ptr addrspace(4) %2)
+  ret void
+}
+
+
+define internal spir_func void @Foo28(ptr addrspace(4) align 1 %this, ptr addrspace(4) %0, ptr addrspace(4) %1, ptr addrspace(4) %2) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %.addr = alloca ptr addrspace(4)
+  %.addr1 = alloca ptr addrspace(4)
+  %.addr2 = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %0, ptr %.addr
+  store ptr addrspace(4) %1, ptr %.addr1
+  store ptr addrspace(4) %2, ptr %.addr2
+  %this3 = load ptr addrspace(4), ptr %this.addr
+  ret void
+}
+
+
+define internal spir_func void @Foo27(ptr addrspace(4) %this, ptr addrspace(4) %extent, ptr addrspace(4) %index) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %extent.addr = alloca ptr addrspace(4)
+  %index.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %extent, ptr %extent.addr
+  store ptr addrspace(4) %index, ptr %index.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %MImpl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %MExtent2 = bitcast ptr addrspace(4) %MImpl1 to ptr addrspace(4)
+  %0 = load ptr addrspace(4), ptr %extent.addr
+  %MIndex = getelementptr inbounds nuw %"sd_ItemBase.23", ptr addrspace(4) %MImpl1, i32 0, i32 1
+  %1 = load ptr addrspace(4), ptr %index.addr
+  ret void
+}
+
+
+
+
+define internal spir_func void @Foo29(ptr addrspace(4) %this, ptr addrspace(4) %extent, ptr addrspace(4) %index, ptr addrspace(4) %offset) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %extent.addr = alloca ptr addrspace(4)
+  %index.addr = alloca ptr addrspace(4)
+  %offset.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %extent, ptr %extent.addr
+  store ptr addrspace(4) %index, ptr %index.addr
+  store ptr addrspace(4) %offset, ptr %offset.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %MImpl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %MExtent2 = bitcast ptr addrspace(4) %MImpl1 to ptr addrspace(4)
+  %0 = load ptr addrspace(4), ptr %extent.addr
+  %MIndex = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %MImpl1, i32 0, i32 1
+  %1 = load ptr addrspace(4), ptr %index.addr
+  %MOffset = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %MImpl1, i32 0, i32 2
+  %2 = load ptr addrspace(4), ptr %offset.addr
+  ret void
+}
+
+
+define internal spir_func void @Bar82(ptr addrspace(4) %this, ptr addrspace(4) %G, ptr addrspace(4) %L, ptr byval(%"range") %GroupRange, ptr addrspace(4) %I) unnamed_addr {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %G.addr = alloca ptr addrspace(4)
+  %L.addr = alloca ptr addrspace(4)
+  %I.addr = alloca ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  store ptr addrspace(4) %G, ptr %G.addr
+  store ptr addrspace(4) %L, ptr %L.addr
+  store ptr addrspace(4) %I, ptr %I.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %globalRange1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %0 = load ptr addrspace(4), ptr %G.addr
+  %localRange = getelementptr inbounds nuw %"group", ptr addrspace(4) %this1, i32 0, i32 1
+  %1 = load ptr addrspace(4), ptr %L.addr
+  %groupRange = getelementptr inbounds nuw %"group", ptr addrspace(4) %this1, i32 0, i32 2
+  %index = getelementptr inbounds nuw %"group", ptr addrspace(4) %this1, i32 0, i32 3
+  %2 = load ptr addrspace(4), ptr %I.addr
+  ret void
+}
+
+
+define internal spir_func void @InitSize4(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv15getGlobalOffsetILi0EEEmv()
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %call)
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv15getGlobalOffsetILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z22__spirv_GlobalOffset_xv()
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z22__spirv_GlobalOffset_xv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInGlobalOffset, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @InitSize3(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv20getLocalInvocationIdILi0EEEmv()
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %call)
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv20getLocalInvocationIdILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z27__spirv_LocalInvocationId_xv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @InitSize2(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv14getWorkgroupIdILi0EEEmv()
+  call spir_func void @Foo46(ptr addrspace(4) %agg.result, i64 %call)
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv14getWorkgroupIdILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z21__spirv_WorkgroupId_xv()
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z21__spirv_WorkgroupId_xv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInWorkgroupId, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @InitSize1(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv16getNumWorkgroupsILi0EEEmv()
+  call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 %call)
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv16getNumWorkgroupsILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z23__spirv_NumWorkgroups_xv()
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z23__spirv_NumWorkgroups_xv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInNumWorkgroups, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @Inv3(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv16getWorkgroupSizeILi0EEEmv()
+  call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 %call)
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv16getWorkgroupSizeILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z23__spirv_WorkgroupSize_xv()
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z23__spirv_WorkgroupSize_xv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInWorkgroupSize, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @Foo8(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) {
+entry:
+  %call = call spir_func i64 @_ZN7__spirv13getGlobalSizeILi0EEEmv()
+  call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 %call)
+  ret void
+}
+
+
+define internal spir_func i64 @_ZN7__spirv13getGlobalSizeILi0EEEmv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %call = call spir_func i64 @_Z20__spirv_GlobalSize_xv()
+  ret i64 %call
+}
+
+
+define internal spir_func i64 @_Z20__spirv_GlobalSize_xv() {
+entry:
+  %retval = alloca i64
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInGlobalSize, align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  ret i64 %1
+}
+
+
+define internal spir_func void @Foo30(ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"nd_item", align 1
+  store ptr addrspace(4) %f, ptr %f.addr
+  %0 = load ptr addrspace(4), ptr %f.addr
+  call spir_func void @Foo33(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0)
+  ret void
+}
+
+
+define internal spir_func i64 @Foo32(ptr addrspace(4) %this) {
+entry:
+  %retval = alloca i64
+  %this.addr = alloca ptr addrspace(4)
+  %TotalOffset = alloca i64
+  %ref.tmp = alloca %class.anon.7
+  %cleanup.dest.slot = alloca i32, align 4
+  %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4)
+  %TotalOffset.ascast = addrspacecast ptr %TotalOffset to ptr addrspace(4)
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  store ptr addrspace(4) %this, ptr %this.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  store i64 0, ptr %TotalOffset
+  %0 = bitcast ptr %ref.tmp to ptr
+  store ptr addrspace(4) %this1, ptr %0
+  %TotalOffset2 = getelementptr inbounds %class.anon.7, ptr %ref.tmp, i32 0, i32 1
+  store ptr addrspace(4) %TotalOffset.ascast, ptr %TotalOffset2
+  call spir_func void @Foo34(ptr addrspace(4) %ref.tmp.ascast)
+  %1 = load i64, ptr %TotalOffset
+  ret i64 %1
+}
+
+
+define internal spir_func void @Foo34(ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %agg.tmp = alloca %"nd_item", align 1
+  store ptr addrspace(4) %f, ptr %f.addr
+  %0 = load ptr addrspace(4), ptr %f.addr
+  call spir_func void @Foo35(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0)
+  ret void
+}
+
+
+define internal spir_func void @Foo35(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) {
+entry:
+  %f.addr = alloca ptr addrspace(4)
+  %ref.tmp = alloca %"nd_item", align 1
+  %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4)
+  %1 = addrspacecast ptr %0 to ptr addrspace(4)
+  store ptr addrspace(4) %f, ptr %f.addr
+  %2 = load ptr addrspace(4), ptr %f.addr
+  %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast)
+  call spir_func void @Foo36(ptr addrspace(4) %2, i64 %call)
+  ret void
+}
+
+
+define internal spir_func void @Foo36(ptr addrspace(4) %this, i64 %I) align 2 {
+entry:
+  %this.addr = alloca ptr addrspace(4)
+  %I.addr = alloca i64
+  store ptr addrspace(4) %this, ptr %this.addr
+  store i64 %I, ptr %I.addr
+  %this1 = load ptr addrspace(4), ptr %this.addr
+  %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4)
+  %1 = load ptr addrspace(4), ptr addrspace(4) %0
+  %TotalOffset = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1
+  %2 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset
+  %3 = load i64, ptr addrspace(4) %2
+  %impl1 = bitcast ptr
addrspace(4) %1 to ptr addrspace(4) + %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2 + %4 = load i64, ptr %I.addr + %conv = trunc i64 %4 to i32 + %call = call spir_func i64 @Foo37(ptr addrspace(4) %MemRange, i32 %conv) + %mul = mul i64 %3, %call + %TotalOffset2 = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1 + %5 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset2 + store i64 %mul, ptr addrspace(4) %5 + %impl32 = bitcast ptr addrspace(4) %1 to ptr addrspace(4) + %Offset3 = bitcast ptr addrspace(4) %impl32 to ptr addrspace(4) + %6 = load i64, ptr %I.addr + %conv4 = trunc i64 %6 to i32 + %call5 = call spir_func i64 @Foo37(ptr addrspace(4) %Offset3, i32 %conv4) + %TotalOffset6 = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1 + %7 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset6 + %8 = load i64, ptr addrspace(4) %7 + %add = add i64 %8, %call5 + store i64 %add, ptr addrspace(4) %7 + ret void +} + + +define internal spir_func void @Foo33(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) { +entry: + %f.addr = alloca ptr addrspace(4) + %ref.tmp = alloca %"nd_item", align 1 + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + %1 = addrspacecast ptr %0 to ptr addrspace(4) + store ptr addrspace(4) %f, ptr %f.addr + %2 = load ptr addrspace(4), ptr %f.addr + %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast) + call spir_func void @Foo38(ptr addrspace(4) %2, i64 %call) + ret void +} + + +define internal spir_func void @Foo38(ptr addrspace(4) %this, i64 %I) align 2 { +entry: + %this.addr = alloca ptr addrspace(4) + %I.addr = alloca i64 + store ptr addrspace(4) %this, ptr %this.addr + store i64 %I, ptr %I.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %1 = load ptr addrspace(4), ptr addrspace(4) %0 + %Offset = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 1 + %2 = load ptr addrspace(4), ptr addrspace(4) %Offset + %3 = load i64, ptr %I.addr + %conv = trunc i64 %3 to i32 + %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %2, i32 %conv) + %4 = load i64, ptr addrspace(4) %call + %call2 = call spir_func ptr addrspace(4) @Foo39(ptr addrspace(4) %1) + %5 = load i64, ptr %I.addr + %conv3 = trunc i64 %5 to i32 + %call4 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %call2, i32 %conv3) + store i64 %4, ptr addrspace(4) %call4 + %AccessRange = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 2 + %6 = load ptr addrspace(4), ptr addrspace(4) %AccessRange + %7 = load i64, ptr %I.addr + %conv5 = trunc i64 %7 to i32 + %call6 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %6, i32 %conv5) + %8 = load i64, ptr addrspace(4) %call6 + %call7 = call spir_func ptr addrspace(4) @Foo40A(ptr addrspace(4) %1) + %9 = load i64, ptr %I.addr + %conv8 = trunc i64 %9 to i32 + %call9 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %call7, i32 %conv8) + store i64 %8, ptr addrspace(4) %call9 + %MemRange = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 3 + %10 = load ptr addrspace(4), ptr addrspace(4) %MemRange + %11 = load i64, ptr %I.addr + %conv10 = trunc i64 %11 to i32 + %call11 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %10, i32 %conv10) + %12 = load i64, ptr addrspace(4) %call11 + %call12 = call spir_func 
ptr addrspace(4) @Foo41A(ptr addrspace(4) %1) + %13 = load i64, ptr %I.addr + %conv13 = trunc i64 %13 to i32 + %call14 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %call12, i32 %conv13) + store i64 %12, ptr addrspace(4) %call14 + ret void +} + + +define internal spir_func ptr addrspace(4) @Foo39(ptr addrspace(4) %this) { +entry: + %retval = alloca ptr addrspace(4) + %this.addr = alloca ptr addrspace(4) + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %Offset2 = bitcast ptr addrspace(4) %impl1 to ptr addrspace(4) + ret ptr addrspace(4) %Offset2 +} + + +define internal spir_func ptr addrspace(4) @Foo40A(ptr addrspace(4) %this) { +entry: + %retval = alloca ptr addrspace(4) + %this.addr = alloca ptr addrspace(4) + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %AccessRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 1 + ret ptr addrspace(4) %AccessRange +} + + +define internal spir_func ptr addrspace(4) @Foo41A(ptr addrspace(4) %this) { +entry: + %retval = alloca ptr addrspace(4) + %this.addr = alloca ptr addrspace(4) + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2 + ret ptr addrspace(4) %MemRange +} + + +define internal spir_func void @Foo13(ptr addrspace(4) %f) { +entry: + %f.addr = alloca ptr addrspace(4) + %agg.tmp = alloca %"nd_item", align 1 + store ptr addrspace(4) %f, ptr %f.addr + %0 = load ptr addrspace(4), ptr %f.addr + call spir_func void @Foo14(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0) + ret void +} + + +define internal spir_func i64 @Foo21(ptr addrspace(4) %this) { +entry: + %retval = alloca i64 + %this.addr = alloca ptr addrspace(4) + %TotalOffset = alloca i64 + %ref.tmp = alloca %class.anon.7 + %cleanup.dest.slot = alloca i32, align 4 + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + %TotalOffset.ascast = addrspacecast ptr %TotalOffset to ptr addrspace(4) + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + store i64 0, ptr %TotalOffset + %0 = bitcast ptr %ref.tmp to ptr + store ptr addrspace(4) %this1, ptr %0 + %TotalOffset2 = getelementptr inbounds %class.anon.7, ptr %ref.tmp, i32 0, i32 1 + store ptr addrspace(4) %TotalOffset.ascast, ptr %TotalOffset2 + call spir_func void @Bar83(ptr addrspace(4) %ref.tmp.ascast) + %1 = load i64, ptr %TotalOffset + ret i64 %1 +} + + +define internal spir_func void @Bar83(ptr addrspace(4) %f) { +entry: + %f.addr = alloca ptr addrspace(4) + %agg.tmp = alloca %"nd_item", align 1 + store ptr addrspace(4) %f, ptr %f.addr + %0 = load ptr addrspace(4), ptr %f.addr + call spir_func void @Bar84(ptr byval(%"nd_item") align 1 %agg.tmp, ptr addrspace(4) %0) + ret void +} + + +define internal spir_func void @Bar84(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) { +entry: + %f.addr = alloca ptr addrspace(4) + 
%ref.tmp = alloca %"nd_item", align 1 + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + %1 = addrspacecast ptr %0 to ptr addrspace(4) + store ptr addrspace(4) %f, ptr %f.addr + %2 = load ptr addrspace(4), ptr %f.addr + %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast) + call spir_func void @Bar85(ptr addrspace(4) %2, i64 %call) + ret void +} + + +define internal spir_func void @Bar85(ptr addrspace(4) %this, i64 %I) align 2 { +entry: + %this.addr = alloca ptr addrspace(4) + %I.addr = alloca i64 + store ptr addrspace(4) %this, ptr %this.addr + store i64 %I, ptr %I.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %1 = load ptr addrspace(4), ptr addrspace(4) %0 + %TotalOffset = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1 + %2 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset + %3 = load i64, ptr addrspace(4) %2 + %impl1 = bitcast ptr addrspace(4) %1 to ptr addrspace(4) + %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2 + %4 = load i64, ptr %I.addr + %conv = trunc i64 %4 to i32 + %call = call spir_func i64 @Foo37(ptr addrspace(4) %MemRange, i32 %conv) + %mul = mul i64 %3, %call + %TotalOffset2 = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1 + %5 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset2 + store i64 %mul, ptr addrspace(4) %5 + %impl32 = bitcast ptr addrspace(4) %1 to ptr addrspace(4) + %Offset3 = bitcast ptr addrspace(4) %impl32 to ptr addrspace(4) + %6 = load i64, ptr %I.addr + %conv4 = trunc i64 %6 to i32 + %call5 = call spir_func i64 @Foo37(ptr addrspace(4) %Offset3, i32 %conv4) + %TotalOffset6 = getelementptr inbounds nuw %class.anon.7, ptr addrspace(4) %this1, i32 0, i32 1 + %7 = load ptr addrspace(4), ptr addrspace(4) %TotalOffset6 + %8 = load i64, ptr addrspace(4) %7 + %add = add i64 %8, %call5 + store i64 %add, ptr addrspace(4) %7 + ret void +} + + +define internal spir_func void @Foo14(ptr byval(%"nd_item") align 1 %0, ptr addrspace(4) %f) { +entry: + %f.addr = alloca ptr addrspace(4) + %ref.tmp = alloca %"nd_item", align 1 + %ref.tmp.ascast = addrspacecast ptr %ref.tmp to ptr addrspace(4) + %1 = addrspacecast ptr %0 to ptr addrspace(4) + store ptr addrspace(4) %f, ptr %f.addr + %2 = load ptr addrspace(4), ptr %f.addr + %call = call spir_func i64 @_ZNKSt17integral_constantImLm0EEcvmEv(ptr addrspace(4) align 1 %ref.tmp.ascast) + call spir_func void @Foo15(ptr addrspace(4) %2, i64 %call) + ret void +} + + +define internal spir_func void @Foo15(ptr addrspace(4) %this, i64 %I) align 2 { +entry: + %this.addr = alloca ptr addrspace(4) + %I.addr = alloca i64 + store ptr addrspace(4) %this, ptr %this.addr + store i64 %I, ptr %I.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %0 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %1 = load ptr addrspace(4), ptr addrspace(4) %0 + %Offset = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 1 + %2 = load ptr addrspace(4), ptr addrspace(4) %Offset + %3 = load i64, ptr %I.addr + %conv = trunc i64 %3 to i32 + %call = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %2, i32 %conv) + %4 = load i64, ptr addrspace(4) %call + %call2 = call spir_func ptr addrspace(4) @Foo17(ptr addrspace(4) %1) + %5 = load i64, ptr %I.addr + %conv3 = trunc i64 %5 to i32 + %call4 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) 
%call2, i32 %conv3) + store i64 %4, ptr addrspace(4) %call4 + %AccessRange = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 2 + %6 = load ptr addrspace(4), ptr addrspace(4) %AccessRange + %7 = load i64, ptr %I.addr + %conv5 = trunc i64 %7 to i32 + %call6 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %6, i32 %conv5) + %8 = load i64, ptr addrspace(4) %call6 + %call7 = call spir_func ptr addrspace(4) @Foo18(ptr addrspace(4) %1) + %9 = load i64, ptr %I.addr + %conv8 = trunc i64 %9 to i32 + %call9 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %call7, i32 %conv8) + store i64 %8, ptr addrspace(4) %call9 + %MemRange = getelementptr inbounds nuw %class.anon.6, ptr addrspace(4) %this1, i32 0, i32 3 + %10 = load ptr addrspace(4), ptr addrspace(4) %MemRange + %11 = load i64, ptr %I.addr + %conv10 = trunc i64 %11 to i32 + %call11 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %10, i32 %conv10) + %12 = load i64, ptr addrspace(4) %call11 + %call12 = call spir_func ptr addrspace(4) @Foo19(ptr addrspace(4) %1) + %13 = load i64, ptr %I.addr + %conv13 = trunc i64 %13 to i32 + %call14 = call spir_func ptr addrspace(4) @Foo16(ptr addrspace(4) %call12, i32 %conv13) + store i64 %12, ptr addrspace(4) %call14 + ret void +} + + +define internal spir_func ptr addrspace(4) @Foo17(ptr addrspace(4) %this) { +entry: + %retval = alloca ptr addrspace(4) + %this.addr = alloca ptr addrspace(4) + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %Offset2 = bitcast ptr addrspace(4) %impl1 to ptr addrspace(4) + ret ptr addrspace(4) %Offset2 +} + + +define internal spir_func ptr addrspace(4) @Foo18(ptr addrspace(4) %this) { +entry: + %retval = alloca ptr addrspace(4) + %this.addr = alloca ptr addrspace(4) + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %AccessRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 1 + ret ptr addrspace(4) %AccessRange +} + + +define internal spir_func ptr addrspace(4) @Foo19(ptr addrspace(4) %this) { +entry: + %retval = alloca ptr addrspace(4) + %this.addr = alloca ptr addrspace(4) + %retval.ascast = addrspacecast ptr %retval to ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %impl1 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %impl1, i32 0, i32 2 + ret ptr addrspace(4) %MemRange +} + + +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) + + +define internal spir_func void @Foo12(ptr addrspace(4) dead_on_unwind noalias writable sret(%"range") %agg.result) { +entry: + call spir_func void @Foo9(ptr addrspace(4) %agg.result, i64 0) + ret void +} + + +define internal spir_func void @Foo10(ptr addrspace(4) %this, ptr byval(%"range") %Offset, ptr byval(%"range") %AccessRange, ptr byval(%"range") %MemoryRange) unnamed_addr { +entry: + %this.addr = alloca ptr addrspace(4) + store ptr addrspace(4) %this, ptr %this.addr + %this1 = load ptr addrspace(4), ptr %this.addr + %Offset21 = bitcast ptr addrspace(4) %this1 to ptr addrspace(4) + %AccessRange3 = 
getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %this1, i32 0, i32 1 + %MemRange = getelementptr inbounds nuw %"detail::AccessorImplDevice", ptr addrspace(4) %this1, i32 0, i32 2 + ret void +} + + +define internal spir_func void @__assert_fail(ptr addrspace(4) %expr, ptr addrspace(4) %file, i32 %line, ptr addrspace(4) %func) { +entry: + %call = tail call spir_func i64 @_Z28__spirv_GlobalInvocationId_xv() + %call1 = tail call spir_func i64 @_Z28__spirv_GlobalInvocationId_yv() + %call2 = tail call spir_func i64 @_Z28__spirv_GlobalInvocationId_zv() + %call3 = tail call spir_func i64 @_Z27__spirv_LocalInvocationId_xv() + %call4 = tail call spir_func i64 @_Z27__spirv_LocalInvocationId_yv() + %call5 = tail call spir_func i64 @_Z27__spirv_LocalInvocationId_zv() + tail call spir_func void @__devicelib_assert_fail(ptr addrspace(4) %expr, ptr addrspace(4) %file, i32 %line, ptr addrspace(4) %func, i64 %call, i64 %call1, i64 %call2, i64 %call3, i64 %call4, i64 %call5) + ret void +} + + +define internal spir_func i64 @_Z28__spirv_GlobalInvocationId_yv() local_unnamed_addr { +entry: + %0 = getelementptr inbounds i8, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, i64 8 + %1 = load i64, ptr addrspace(1) %0 + ret i64 %1 +} + + +define internal spir_func i64 @_Z28__spirv_GlobalInvocationId_zv() local_unnamed_addr { +entry: + %0 = getelementptr inbounds i8, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, i64 16 + %1 = load i64, ptr addrspace(1) %0, align 16 + ret i64 %1 +} + + +define internal spir_func i64 @_Z27__spirv_LocalInvocationId_yv() local_unnamed_addr { +entry: + %0 = getelementptr inbounds i8, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, i64 8 + %1 = load i64, ptr addrspace(1) %0 + ret i64 %1 +} + + +define internal spir_func i64 @_Z27__spirv_LocalInvocationId_zv() local_unnamed_addr { +entry: + %0 = getelementptr inbounds i8, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, i64 16 + %1 = load i64, ptr addrspace(1) %0, align 16 + ret i64 %1 +} + + +define internal spir_func void @__devicelib_assert_fail(ptr addrspace(4) %expr, ptr addrspace(4) %file, i32 %line, ptr addrspace(4) %func, i64 %gid0, i64 %gid1, i64 %gid2, i64 %lid0, i64 %lid1, i64 %lid2) local_unnamed_addr { +entry: + %call.i = tail call spir_func i32 @_Z29__spirv_AtomicCompareExchangePU3AS1iN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagES5_ii(ptr addrspace(1) @SPIR_AssertHappenedMem, i32 1, i32 16, i32 16, i32 1, i32 0) + %cmp = icmp eq i32 %call.i, 0 + %0 = getelementptr inbounds nuw i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 4 + %1 = getelementptr inbounds nuw i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 261 + %2 = getelementptr inbounds nuw i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 518 + br i1 %cmp, label %if.then, label %if.end82 + +if.then: ; preds = %entry + store i32 %line, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 648) + store i64 %gid0, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 656) + store i64 %gid1, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 664) + store i64 %gid2, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 672) + store i64 %lid0, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 680) + store i64 %lid1, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 688) + store i64 %lid2, ptr 
addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @SPIR_AssertHappenedMem, i64 696) + %tobool.not = icmp eq ptr addrspace(4) %expr, null + br i1 %tobool.not, label %if.end, label %for.cond.preheader + +for.cond.preheader: ; preds = %if.then + br label %for.cond + +for.cond: ; preds = %for.cond.preheader, %for.inc + %ExprLength.0 = phi i32 [ %inc, %for.inc ], [ 0, %for.cond.preheader ] + %C.0 = phi ptr addrspace(4) [ %incdec.ptr, %for.inc ], [ %expr, %for.cond.preheader ] + %3 = load i8, ptr addrspace(4) %C.0, align 1 + %cmp2.not = icmp eq i8 %3, 0 + br i1 %cmp2.not, label %if.end, label %for.inc + +for.inc: ; preds = %for.cond + %incdec.ptr = getelementptr inbounds nuw i8, ptr addrspace(4) %C.0, i64 1 + %inc = add nuw nsw i32 %ExprLength.0, 1 + br label %for.cond + +if.end: ; preds = %for.cond, %if.then + %ExprLength.1 = phi i32 [ 0, %if.then ], [ %ExprLength.0, %for.cond ] + %tobool3.not = icmp eq ptr addrspace(4) %file, null + br i1 %tobool3.not, label %if.end16, label %for.cond6.preheader + +for.cond6.preheader: ; preds = %if.end + br label %for.cond6 + +for.cond6: ; preds = %for.cond6.preheader, %for.inc12 + %FileLength.0 = phi i32 [ %inc14, %for.inc12 ], [ 0, %for.cond6.preheader ] + %C5.0 = phi ptr addrspace(4) [ %incdec.ptr13, %for.inc12 ], [ %file, %for.cond6.preheader ] + %4 = load i8, ptr addrspace(4) %C5.0, align 1 + %cmp8.not = icmp eq i8 %4, 0 + br i1 %cmp8.not, label %if.end16, label %for.inc12 + +for.inc12: ; preds = %for.cond6 + %incdec.ptr13 = getelementptr inbounds nuw i8, ptr addrspace(4) %C5.0, i64 1 + %inc14 = add nuw nsw i32 %FileLength.0, 1 + br label %for.cond6 + +if.end16: ; preds = %for.cond6, %if.end + %FileLength.1 = phi i32 [ 0, %if.end ], [ %FileLength.0, %for.cond6 ] + %tobool17.not = icmp eq ptr addrspace(4) %func, null + br i1 %tobool17.not, label %if.end30.thread, label %for.cond20.preheader + +for.cond20.preheader: ; preds = %if.end16 + br label %for.cond20 + +for.cond20: ; preds = %for.cond20.preheader, %for.inc26 + %FuncLength.0 = phi i32 [ %inc28, %for.inc26 ], [ 0, %for.cond20.preheader ] + %C19.0 = phi ptr addrspace(4) [ %incdec.ptr27, %for.inc26 ], [ %func, %for.cond20.preheader ] + %5 = load i8, ptr addrspace(4) %C19.0, align 1 + %cmp22.not = icmp eq i8 %5, 0 + br i1 %cmp22.not, label %if.end30, label %for.inc26 + +for.inc26: ; preds = %for.cond20 + %incdec.ptr27 = getelementptr inbounds nuw i8, ptr addrspace(4) %C19.0, i64 1 + %inc28 = add i32 %FuncLength.0, 1 + br label %for.cond20 + +if.end30: ; preds = %for.cond20 + %spec.select = tail call i32 @llvm.umin.i32(i32 %ExprLength.1, i32 256) + %MaxFileIdx.0 = tail call i32 @llvm.umin.i32(i32 %FileLength.1, i32 256) + %spec.select126 = tail call i32 @llvm.umin.i32(i32 %FuncLength.0, i32 128) + br label %6 + +if.end30.thread: ; preds = %if.end16 + %spec.select116 = tail call i32 @llvm.umin.i32(i32 %ExprLength.1, i32 256) + %MaxFileIdx.0118 = tail call i32 @llvm.umin.i32(i32 %FileLength.1, i32 256) + br label %6 + +6: ; preds = %if.end30, %if.end30.thread + %MaxFileIdx.0124 = phi i32 [ %MaxFileIdx.0118, %if.end30.thread ], [ %MaxFileIdx.0, %if.end30 ] + %spec.select122 = phi i32 [ %spec.select116, %if.end30.thread ], [ %spec.select, %if.end30 ] + %7 = phi i32 [ 0, %if.end30.thread ], [ %spec.select126, %if.end30 ] + br label %for.cond40 + +for.cond40: ; preds = %for.body44, %6 + %lsr.iv9 = phi ptr addrspace(4) [ %scevgep10, %for.body44 ], [ %expr, %6 ] + %lsr.iv7 = phi ptr addrspace(1) [ %scevgep8, %for.body44 ], [ %0, %6 ] + %Idx.0 = phi i32 [ 0, %6 ], [ %inc48, %for.body44 ] + %cmp41 = icmp 
ult i32 %Idx.0, %spec.select122 + br i1 %cmp41, label %for.body44, label %for.cond.cleanup42 + +for.cond.cleanup42: ; preds = %for.cond40 + %idxprom50 = zext nneg i32 %spec.select122 to i64 + %arrayidx51 = getelementptr inbounds [257 x i8], ptr addrspace(1) %0, i64 0, i64 %idxprom50 + store i8 0, ptr addrspace(1) %arrayidx51, align 1 + br label %for.cond53 + +for.cond53: ; preds = %for.body57, %for.cond.cleanup42 + %lsr.iv5 = phi ptr addrspace(4) [ %scevgep6, %for.body57 ], [ %file, %for.cond.cleanup42 ] + %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %for.body57 ], [ %1, %for.cond.cleanup42 ] + %Idx52.0 = phi i32 [ 0, %for.cond.cleanup42 ], [ %inc63, %for.body57 ] + %cmp54 = icmp ult i32 %Idx52.0, %MaxFileIdx.0124 + br i1 %cmp54, label %for.body57, label %for.cond.cleanup55 + +for.cond.cleanup55: ; preds = %for.cond53 + %idxprom65 = zext nneg i32 %MaxFileIdx.0124 to i64 + %arrayidx66 = getelementptr inbounds [257 x i8], ptr addrspace(1) %1, i64 0, i64 %idxprom65 + store i8 0, ptr addrspace(1) %arrayidx66, align 1 + br label %for.cond68 + +for.cond68: ; preds = %for.body72, %for.cond.cleanup55 + %lsr.iv1 = phi ptr addrspace(4) [ %scevgep2, %for.body72 ], [ %func, %for.cond.cleanup55 ] + %lsr.iv = phi ptr addrspace(1) [ %scevgep, %for.body72 ], [ %2, %for.cond.cleanup55 ] + %Idx67.0 = phi i32 [ 0, %for.cond.cleanup55 ], [ %inc78, %for.body72 ] + %cmp69 = icmp ult i32 %Idx67.0, %7 + br i1 %cmp69, label %for.body72, label %for.cond.cleanup70 + +for.cond.cleanup70: ; preds = %for.cond68 + %idxprom80 = zext nneg i32 %7 to i64 + %arrayidx81 = getelementptr inbounds [129 x i8], ptr addrspace(1) %2, i64 0, i64 %idxprom80 + store i8 0, ptr addrspace(1) %arrayidx81, align 1 + tail call spir_func void @_Z19__spirv_AtomicStorePU3AS1iN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEi(ptr addrspace(1) @SPIR_AssertHappenedMem, i32 1, i32 16, i32 2) + br label %if.end82 + +if.end82: ; preds = %for.cond.cleanup70, %entry + ret void + +for.body72: ; preds = %for.cond68 + %8 = load i8, ptr addrspace(4) %lsr.iv1, align 1 + store i8 %8, ptr addrspace(1) %lsr.iv, align 1 + %inc78 = add nuw nsw i32 %Idx67.0, 1 + %scevgep = getelementptr i8, ptr addrspace(1) %lsr.iv, i64 1 + %scevgep2 = getelementptr i8, ptr addrspace(4) %lsr.iv1, i64 1 + br label %for.cond68 + +for.body57: ; preds = %for.cond53 + %9 = load i8, ptr addrspace(4) %lsr.iv5, align 1 + store i8 %9, ptr addrspace(1) %lsr.iv3, align 1 + %inc63 = add nuw nsw i32 %Idx52.0, 1 + %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1 + %scevgep6 = getelementptr i8, ptr addrspace(4) %lsr.iv5, i64 1 + br label %for.cond53 + +for.body44: ; preds = %for.cond40 + %10 = load i8, ptr addrspace(4) %lsr.iv9, align 1 + store i8 %10, ptr addrspace(1) %lsr.iv7, align 1 + %inc48 = add nuw nsw i32 %Idx.0, 1 + %scevgep8 = getelementptr i8, ptr addrspace(1) %lsr.iv7, i64 1 + %scevgep10 = getelementptr i8, ptr addrspace(4) %lsr.iv9, i64 1 + br label %for.cond40 +} + +declare extern_weak dso_local spir_func i32 @_Z29__spirv_AtomicCompareExchangePU3AS1iN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagES5_ii(ptr addrspace(1), i32, i32, i32, i32, i32) local_unnamed_addr +declare extern_weak dso_local spir_func void @_Z19__spirv_AtomicStorePU3AS1iN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEi(ptr addrspace(1), i32, i32, i32) local_unnamed_addr +declare i32 @llvm.umin.i32(i32, i32) diff --git a/llvm/test/CodeGen/WebAssembly/call-indirect.ll b/llvm/test/CodeGen/WebAssembly/call-indirect.ll index 55a654f358490..e0a0d14deacba 100644 --- 
a/llvm/test/CodeGen/WebAssembly/call-indirect.ll +++ b/llvm/test/CodeGen/WebAssembly/call-indirect.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -asm-verbose=false -mattr=-reference-types -O2 | FileCheck --check-prefixes=CHECK,NOREF %s -; RUN: llc < %s -asm-verbose=false -mattr=+reference-types -O2 | FileCheck --check-prefixes=CHECK,REF %s +; RUN: llc < %s -asm-verbose=false -mattr=-reference-types,-call-indirect-overlong -O2 | FileCheck --check-prefixes=CHECK,NOREF %s +; RUN: llc < %s -asm-verbose=false -mattr=+call-indirect-overlong -O2 | FileCheck --check-prefixes=CHECK,REF %s ; RUN: llc < %s -asm-verbose=false -O2 --filetype=obj | obj2yaml | FileCheck --check-prefix=OBJ %s ; Test that compilation units with call_indirect but without any diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll index 3b312dabcd84d..ab9023cbac604 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll @@ -1,9 +1,9 @@ ; REQUIRES: asserts ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,bulk-memory | FileCheck %s ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,bulk-memory -; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -verify-machineinstrs -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory | FileCheck %s --check-prefix=NOOPT -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS +; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -verify-machineinstrs -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt | FileCheck %s --check-prefix=NOOPT +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 
-wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS target triple = "wasm32-unknown-unknown" diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll index 6df626df08883..22fda36c25bfd 100644 --- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll +++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll @@ -1,9 +1,9 @@ ; REQUIRES: asserts ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,bulk-memory | FileCheck %s ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,bulk-memory -; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -verify-machineinstrs -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,-bulk-memory | FileCheck %s --check-prefix=NOOPT -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,-bulk-memory -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,-bulk-memory -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS +; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -verify-machineinstrs -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt | FileCheck %s --check-prefix=NOOPT +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -wasm-enable-exnref -exception-model=wasm -mattr=+exception-handling,-bulk-memory,-bulk-memory-opt -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS target triple = "wasm32-unknown-unknown" diff --git a/llvm/test/CodeGen/WebAssembly/disable-feature.ll b/llvm/test/CodeGen/WebAssembly/disable-feature.ll index 0684432a114df..5f7275f3699ed 100644 --- a/llvm/test/CodeGen/WebAssembly/disable-feature.ll +++ b/llvm/test/CodeGen/WebAssembly/disable-feature.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mattr=-sign-ext,-bulk-memory | FileCheck %s +; RUN: llc < %s 
-mattr=-sign-ext,-bulk-memory,-bulk-memory-opt | FileCheck %s target triple = "wasm32-unknown-unknown" @@ -21,7 +21,7 @@ define i8 @not_use_extend8_s(i8 %v, i8 %x) { ret i8 %a } -attributes #0 = { "target-features"="+bulk-memory," } +attributes #0 = { "target-features"="+bulk-memory-opt" } declare void @llvm.memset.p0.i32(ptr, i8, i32, i1) diff --git a/llvm/test/CodeGen/WebAssembly/function-pointer64.ll b/llvm/test/CodeGen/WebAssembly/function-pointer64.ll index d5d10b00824fe..2accd4151767f 100644 --- a/llvm/test/CodeGen/WebAssembly/function-pointer64.ll +++ b/llvm/test/CodeGen/WebAssembly/function-pointer64.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -asm-verbose=false -mattr=-reference-types -O2 | FileCheck %s +; RUN: llc < %s -asm-verbose=false -mattr=-reference-types,-call-indirect-overlong -O2 | FileCheck %s ; RUN: llc < %s -asm-verbose=false -mattr=+reference-types -O2 | FileCheck --check-prefix=REF %s -; RUN: llc < %s -asm-verbose=false -mattr=-reference-types -O2 --filetype=obj | obj2yaml | FileCheck --check-prefix=YAML %s +; RUN: llc < %s -asm-verbose=false -mattr=-reference-types,-call-indirect-overlong -O2 --filetype=obj | obj2yaml | FileCheck --check-prefix=YAML %s ; This tests pointer features that may codegen differently in wasm64. diff --git a/llvm/test/CodeGen/WebAssembly/reference-types.ll b/llvm/test/CodeGen/WebAssembly/reference-types.ll index 168aaec8f0943..3df383b023726 100644 --- a/llvm/test/CodeGen/WebAssembly/reference-types.ll +++ b/llvm/test/CodeGen/WebAssembly/reference-types.ll @@ -7,7 +7,11 @@ define void @reference-types() { ret void } -; CHECK: .int8 1 +; CHECK: .section .custom_section.target_features,"",@ +; CHECK-NEXT: .int8 2 +; CHECK-NEXT: .int8 43 +; CHECK-NEXT: .int8 22 +; CHECK-NEXT: .ascii "call-indirect-overlong" ; CHECK-NEXT: .int8 43 ; CHECK-NEXT: .int8 15 ; CHECK-NEXT: .ascii "reference-types" diff --git a/llvm/test/CodeGen/WebAssembly/target-features-attrs.ll b/llvm/test/CodeGen/WebAssembly/target-features-attrs.ll index 25dee51ac8c38..0e46b96591816 100644 --- a/llvm/test/CodeGen/WebAssembly/target-features-attrs.ll +++ b/llvm/test/CodeGen/WebAssembly/target-features-attrs.ll @@ -55,11 +55,14 @@ attributes #2 = { "target-features"="+reference-types" } ; Features in function attributes: ; +atomics, +nontrapping-fptoint, +reference-types ; CHECK-LABEL: .custom_section.target_features,"",@ -; CHECK-NEXT: .int8 3 +; CHECK-NEXT: .int8 4 ; CHECK-NEXT: .int8 43 ; CHECK-NEXT: .int8 7 ; CHECK-NEXT: .ascii "atomics" ; CHECK-NEXT: .int8 43 +; CHECK-NEXT: .int8 22 +; CHECK-NEXT: .ascii "call-indirect-overlong" +; CHECK-NEXT: .int8 43 ; CHECK-NEXT: .int8 19 ; CHECK-NEXT: .ascii "nontrapping-fptoint" ; CHECK-NEXT: .int8 43 @@ -69,11 +72,14 @@ attributes #2 = { "target-features"="+reference-types" } ; Features in function attributes + features specified by -mattr= option: ; +atomics, +nontrapping-fptoint, +reference-types, +simd128 ; SIMD128-LABEL: .custom_section.target_features,"",@ -; SIMD128-NEXT: .int8 4 +; SIMD128-NEXT: .int8 5 ; SIMD128-NEXT: .int8 43 ; SIMD128-NEXT: .int8 7 ; SIMD128-NEXT: .ascii "atomics" ; SIMD128-NEXT: .int8 43 +; SIMD128-NEXT: .int8 22 +; SIMD128-NEXT: .ascii "call-indirect-overlong" +; SIMD128-NEXT: .int8 43 ; SIMD128-NEXT: .int8 19 ; SIMD128-NEXT: .ascii "nontrapping-fptoint" ; SIMD128-NEXT: .int8 43 diff --git a/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll b/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll index ba10dd94a9838..1c77ad5c049a5 100644 --- a/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll +++ 
b/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mcpu=mvp | FileCheck %s --check-prefixes MVP ; RUN: llc < %s -mcpu=generic | FileCheck %s --check-prefixes GENERIC +; RUN: llc < %s -mcpu=lime1 | FileCheck %s --check-prefixes LIME1 ; RUN: llc < %s | FileCheck %s --check-prefixes GENERIC ; RUN: llc < %s -mcpu=bleeding-edge | FileCheck %s --check-prefixes BLEEDING-EDGE @@ -11,13 +12,19 @@ target triple = "wasm32-unknown-unknown" ; mvp: should not contain the target features section ; MVP-NOT: .custom_section.target_features,"",@ -; generic: +multivalue, +mutable-globals, +reference-types, +sign-ext +; generic: +call-indirect-overlong, +multivalue, +mutable-globals, +reference-types, +sign-ext ; GENERIC-LABEL: .custom_section.target_features,"",@ -; GENERIC-NEXT: .int8 6 +; GENERIC-NEXT: .int8 8 ; GENERIC-NEXT: .int8 43 ; GENERIC-NEXT: .int8 11 ; GENERIC-NEXT: .ascii "bulk-memory" ; GENERIC-NEXT: .int8 43 +; GENERIC-NEXT: .int8 15 +; GENERIC-NEXT: .ascii "bulk-memory-opt" +; GENERIC-NEXT: .int8 43 +; GENERIC-NEXT: .int8 22 +; GENERIC-NEXT: .ascii "call-indirect-overlong" +; GENERIC-NEXT: .int8 43 ; GENERIC-NEXT: .int8 10 ; GENERIC-NEXT: .ascii "multivalue" ; GENERIC-NEXT: .int8 43 @@ -33,12 +40,39 @@ target triple = "wasm32-unknown-unknown" ; GENERIC-NEXT: .int8 8 ; GENERIC-NEXT: .ascii "sign-ext" -; bleeding-edge: +atomics, +bulk-memory, +exception-handling, +extended-const, -; +fp16, +multimemory, +multivalue, +mutable-globals, -; +nontrapping-fptoint, +relaxed-simd, +reference-types, -; +simd128, +sign-ext, +tail-call +; lime1: +bulk-memory-opt, +call-indirect-overlong, +extended-const, +multivalue, +; +mutable-globals, +nontrapping-fptoint, +sign-ext +; LIME1-LABEL: .custom_section.target_features,"",@ +; LIME1-NEXT: .int8 7 +; LIME1-NEXT: .int8 43 +; LIME1-NEXT: .int8 15 +; LIME1-NEXT: .ascii "bulk-memory-opt" +; LIME1-NEXT: .int8 43 +; LIME1-NEXT: .int8 22 +; LIME1-NEXT: .ascii "call-indirect-overlong" +; LIME1-NEXT: .int8 43 +; LIME1-NEXT: .int8 14 +; LIME1-NEXT: .ascii "extended-const" +; LIME1-NEXT: .int8 43 +; LIME1-NEXT: .int8 10 +; LIME1-NEXT: .ascii "multivalue" +; LIME1-NEXT: .int8 43 +; LIME1-NEXT: .int8 15 +; LIME1-NEXT: .ascii "mutable-globals" +; LIME1-NEXT: .int8 43 +; LIME1-NEXT: .int8 19 +; LIME1-NEXT: .ascii "nontrapping-fptoint" +; LIME1-NEXT: .int8 43 +; LIME1-NEXT: .int8 8 +; LIME1-NEXT: .ascii "sign-ext" + +; bleeding-edge: +atomics, +bulk-memory, +bulk-memory-opt, +; +call-indirect-overlong, +exception-handling, +; +extended-const, +fp16, +multimemory, +multivalue, +; +mutable-globals, +nontrapping-fptoint, +relaxed-simd, +; +reference-types, +simd128, +sign-ext, +tail-call ; BLEEDING-EDGE-LABEL: .section .custom_section.target_features,"",@ -; BLEEDING-EDGE-NEXT: .int8 14 +; BLEEDING-EDGE-NEXT: .int8 16 ; BLEEDING-EDGE-NEXT: .int8 43 ; BLEEDING-EDGE-NEXT: .int8 7 ; BLEEDING-EDGE-NEXT: .ascii "atomics" @@ -46,6 +80,12 @@ target triple = "wasm32-unknown-unknown" ; BLEEDING-EDGE-NEXT: .int8 11 ; BLEEDING-EDGE-NEXT: .ascii "bulk-memory" ; BLEEDING-EDGE-NEXT: .int8 43 +; BLEEDING-EDGE-NEXT: .int8 15 +; BLEEDING-EDGE-NEXT: .ascii "bulk-memory-opt" +; BLEEDING-EDGE-NEXT: .int8 43 +; BLEEDING-EDGE-NEXT: .int8 22 +; BLEEDING-EDGE-NEXT: .ascii "call-indirect-overlong" +; BLEEDING-EDGE-NEXT: .int8 43 ; BLEEDING-EDGE-NEXT: .int8 18 ; BLEEDING-EDGE-NEXT: .ascii "exception-handling" ; BLEEDING-EDGE-NEXT: .int8 43 diff --git a/llvm/test/CodeGen/WebAssembly/target-features-tls.ll b/llvm/test/CodeGen/WebAssembly/target-features-tls.ll index 
45bc06b5d5c96..4abe01a73aeee 100644 --- a/llvm/test/CodeGen/WebAssembly/target-features-tls.ll +++ b/llvm/test/CodeGen/WebAssembly/target-features-tls.ll @@ -21,11 +21,14 @@ target triple = "wasm32-unknown-unknown" ; +bulk-memory ; BULK-MEM-LABEL: .custom_section.target_features,"",@ -; BULK-MEM-NEXT: .int8 2 +; BULK-MEM-NEXT: .int8 3 ; BULK-MEM-NEXT: .int8 43 ; BULK-MEM-NEXT: .int8 7 ; BULK-MEM-NEXT: .ascii "atomics" ; BULK-MEM-NEXT: .int8 43 ; BULK-MEM-NEXT: .int8 11 ; BULK-MEM-NEXT: .ascii "bulk-memory" +; BULK-MEM-NEXT: .int8 43 +; BULK-MEM-NEXT: .int8 15 +; BULK-MEM-NEXT: .ascii "bulk-memory-opt" ; BULK-MEM-NEXT: .tbss.foo,"T",@ diff --git a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll index 4a6556bdc4a91..494e4bc8e068e 100644 --- a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll +++ b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll @@ -85,8 +85,7 @@ define i64 @test_signed_i64_f64(double %f) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: vcvttpd2qq %xmm1, %xmm1 +; X86-NEXT: vcvttpd2qq %xmm0, %xmm1 ; X86-NEXT: vmovd %xmm1, %esi ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 332fbf7188af8..2163121410553 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -141,56 +141,61 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-LABEL: fmul_pow2_8xhalf: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: subq $88, %rsp -; CHECK-SSE-NEXT: .cfi_def_cfa_offset 96 +; CHECK-SSE-NEXT: subq $104, %rsp +; CHECK-SSE-NEXT: .cfi_def_cfa_offset 112 ; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; CHECK-SSE-NEXT: pslld $23, %xmm1 ; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] ; CHECK-SSE-NEXT: paddd %xmm2, %xmm1 ; CHECK-SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; CHECK-SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE-NEXT: pslld $16, %xmm1 +; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; CHECK-SSE-NEXT: pslld $23, %xmm0 ; CHECK-SSE-NEXT: paddd %xmm2, %xmm0 ; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: pslld $16, %xmm0 -; CHECK-SSE-NEXT: psrld $16, %xmm0 ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-SSE-NEXT: psrld $16, %xmm0 ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-SSE-NEXT: psrlq $48, %xmm0 +; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 
-; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3] +; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 +; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-SSE-NEXT: psrld $16, %xmm0 ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: cvtdq2ps (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-SSE-NEXT: psrlq $48, %xmm0 +; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3] +; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT @@ -202,39 +207,39 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-SSE-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT @@ -246,14 +251,13 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0] -; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE-NEXT: addq $88, %rsp +; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-SSE-NEXT: addq $104, %rsp ; CHECK-SSE-NEXT: 
.cfi_def_cfa_offset 8 ; CHECK-SSE-NEXT: retq ; @@ -1028,17 +1032,17 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-SSE-NEXT: pslld $23, %xmm0 ; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,2,u,u,u,u,u,u] -; CHECK-SSE-NEXT: pxor %xmm0, %xmm0 -; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0 +; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,2,u,u,u,u,u,u] +; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE-NEXT: psrld $16, %xmm0 +; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = mem[1,1,1,1] -; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 +; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -1049,8 +1053,9 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT ; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT -; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE-NEXT: addq $40, %rsp ; CHECK-SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll index 6aad4c2ebba1d..2dedb10d42fb4 100644 --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -731,7 +731,7 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) nounwind { ; ; AVX512-LABEL: stest_f16i32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512-NEXT: vpmovsqd %ymm0, %xmm0 ; AVX512-NEXT: vzeroupper @@ -894,7 +894,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind { ; ; AVX512-LABEL: utesth_f16i32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvttps2uqq %ymm0, %zmm0 ; AVX512-NEXT: vpmovusqd %ymm0, %xmm0 ; AVX512-NEXT: vzeroupper @@ -1031,7 +1031,7 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) nounwind { ; ; AVX512-LABEL: ustest_f16i32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 ; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 @@ -3343,7 +3343,7 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) nounwind { ; ; AVX512-LABEL: stest_f16i32_mm: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512-NEXT: vpmovsqd %ymm0, %xmm0 ; AVX512-NEXT: vzeroupper @@ -3504,7 +3504,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind { ; ; AVX512-LABEL: utesth_f16i32_mm: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvttps2uqq %ymm0, %zmm0 ; AVX512-NEXT: vpmovusqd %ymm0, %xmm0 ; AVX512-NEXT: vzeroupper @@ -3640,7 +3640,7 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) nounwind { ; ; AVX512-LABEL: ustest_f16i32_mm: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index fe240286462e9..362b3b945f962 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -630,13 +630,8 @@ define void @pr59677(i32 %x, ptr %out) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: vmovd %eax, %xmm0 -; X86-NEXT: orl $1, %eax -; X86-NEXT: vmovd %eax, %xmm1 -; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) @@ -651,10 +646,6 @@ define void @pr59677(i32 %x, ptr %out) nounwind { ; X64-NEXT: pushq %rbx ; X64-NEXT: movq %rsi, %rbx ; X64-NEXT: vmovd %edi, %xmm0 -; X64-NEXT: orl $1, %edi -; X64-NEXT: vmovd %edi, %xmm1 -; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X64-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: callq sinf@PLT diff --git a/llvm/test/CodeGen/X86/no-trap-after-noreturn-fastisel.ll b/llvm/test/CodeGen/X86/no-trap-after-noreturn-fastisel.ll new file mode 100644 index 0000000000000..5149209f79d15 --- /dev/null +++ b/llvm/test/CodeGen/X86/no-trap-after-noreturn-fastisel.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 -trap-unreachable -no-trap-after-noreturn -fast-isel-abort=3 < %s | FileCheck %s + +declare void @foo() + +define void @noreturn_unreachable() nounwind { +; CHECK-LABEL: noreturn_unreachable: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq foo@PLT + call void @foo() noreturn + unreachable +} diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 5148d1566c629..62ee0b298ba91 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -4966,8 +4966,6 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { ; ; F16C-LABEL: fptosi_2f16_to_4i32: ; F16C: # %bb.0: -; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; 
F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vcvttps2dq %xmm0, %xmm0 ; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero @@ -4975,8 +4973,6 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { ; ; AVX512-LABEL: fptosi_2f16_to_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero @@ -5104,8 +5100,6 @@ define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind { ; ; AVX512-FASTLANE-LABEL: fptoui_2f16_to_4i32: ; AVX512-FASTLANE: # %bb.0: -; AVX512-FASTLANE-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-FASTLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-FASTLANE-NEXT: vcvttps2udq %xmm0, %xmm0 ; AVX512-FASTLANE-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero @@ -5212,7 +5206,7 @@ define <4 x i32> @fptoui_4f16_to_4i32(<4 x half> %a) nounwind { ; ; AVX512F-LABEL: fptoui_4f16_to_4i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/widen_conv-3.ll b/llvm/test/CodeGen/X86/widen_conv-3.ll index 5887834f9af2f..f9b588b8b8915 100644 --- a/llvm/test/CodeGen/X86/widen_conv-3.ll +++ b/llvm/test/CodeGen/X86/widen_conv-3.ll @@ -10,7 +10,7 @@ define void @convert_v2i16_to_v2f32(ptr %dst.addr, <2 x i16> %src) nounwind { ; X86-SSE2-LABEL: convert_v2i16_to_v2f32: ; X86-SSE2: # %bb.0: # %entry ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE2-NEXT: psrad $16, %xmm0 ; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE2-NEXT: movlps %xmm0, (%eax) @@ -26,7 +26,7 @@ define void @convert_v2i16_to_v2f32(ptr %dst.addr, <2 x i16> %src) nounwind { ; ; X64-SSE2-LABEL: convert_v2i16_to_v2f32: ; X64-SSE2: # %bb.0: # %entry -; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE2-NEXT: psrad $16, %xmm0 ; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-SSE2-NEXT: movlps %xmm0, (%rdi) diff --git a/llvm/test/MC/Disassembler/SystemZ/insns-pcrel.txt b/llvm/test/MC/Disassembler/SystemZ/insns-pcrel.txt index ef1d0f1970d16..f12441c9c6cba 100644 --- a/llvm/test/MC/Disassembler/SystemZ/insns-pcrel.txt +++ b/llvm/test/MC/Disassembler/SystemZ/insns-pcrel.txt @@ -52,7 +52,7 @@ 0xa7 0xf5 0x7f 0xff # 0x0000003c: -# CHECK: brcl 0, 0x3c +# CHECK: jgnop 0x3c 0xc0 0x04 0x00 0x00 0x00 0x00 # 0x00000042: @@ -116,15 +116,15 @@ 0xc0 0xf4 0x00 0x00 0x00 0x00 # 0x0000009c: -# CHECK: brcl 0, 0x9a +# CHECK: jgnop 0x9a 0xc0 0x04 0xff 0xff 0xff 0xff # 0x000000a2: -# CHECK: brcl 0, 0xffffffff000000a2 +# CHECK: jgnop 0xffffffff000000a2 0xc0 0x04 0x80 0x00 0x00 0x00 # 0x000000a8: -# CHECK: brcl 0, 0x1000000a6 +# CHECK: jgnop 0x1000000a6 0xc0 0x04 0x7f 0xff 0xff 0xff # 0x000000ae: @@ -140,7 +140,7 @@ 0xc0 0xf4 0x7f 0xff 0xff 0xff # 0x000000c0: -# CHECK: brc 0, 0xc0 +# CHECK: jnop 0xc0 0xa7 0x04 0x00 0x00 # 0x000000c4: @@ -204,15 +204,15 @@ 0xa7 0xf4 0x00 0x00 # 0x00000100: -# CHECK: brc 0, 0xfe +# CHECK: jnop 0xfe 0xa7 0x04 0xff 0xff # 0x00000104: -# CHECK: brc 0, 0xffffffffffff0104 +# CHECK: jnop 0xffffffffffff0104 0xa7 0x04 0x80 0x00 # 0x00000108: -# CHECK: brc 0, 0x10106 
+# CHECK: jnop 0x10106 0xa7 0x04 0x7f 0xff # 0x0000010c: diff --git a/llvm/test/MC/Disassembler/SystemZ/insns.txt b/llvm/test/MC/Disassembler/SystemZ/insns.txt index 07a1ff6d18388..a4e4a2203a467 100644 --- a/llvm/test/MC/Disassembler/SystemZ/insns.txt +++ b/llvm/test/MC/Disassembler/SystemZ/insns.txt @@ -1315,7 +1315,7 @@ # CHECK: bassm %r15, %r1 0x0c 0xf1 -# CHECK: nop 0 +# CHECK: nop 0x47 0x00 0x00 0x00 # CHECK: nop 4095 diff --git a/llvm/test/MC/SystemZ/insn-good-zos-pcrel.s b/llvm/test/MC/SystemZ/insn-good-zos-pcrel.s index 734520798baa6..0acbe26d75b15 100644 --- a/llvm/test/MC/SystemZ/insn-good-zos-pcrel.s +++ b/llvm/test/MC/SystemZ/insn-good-zos-pcrel.s @@ -3,7 +3,7 @@ *CHECK: brcl 0, FOO * encoding: [0xc0,0x04,A,A,A,A] *CHECK: fixup A - offset: 2, value: FOO+2, kind: FK_390_PC32DBL -*CHECK: brcl 0, FOO * encoding: [0xc0,0x04,A,A,A,A] +*CHECK: jgnop FOO * encoding: [0xc0,0x04,A,A,A,A] *CHECK: fixup A - offset: 2, value: FOO+2, kind: FK_390_PC32DBL brcl 0,FOO jlnop FOO diff --git a/llvm/test/MC/SystemZ/insn-good.s b/llvm/test/MC/SystemZ/insn-good.s index 09f55049546c2..93f5ff27780ab 100644 --- a/llvm/test/MC/SystemZ/insn-good.s +++ b/llvm/test/MC/SystemZ/insn-good.s @@ -1398,35 +1398,35 @@ #CHECK: brc 0, .[[LAB:L.*]]-65536 # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL -#CHECK: brc 0, .[[LAB:L.*]]-65536 # encoding: [0xa7,0x04,A,A] +#CHECK: jnop .[[LAB:L.*]]-65536 # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL brc 0, -0x10000 jnop -0x10000 #CHECK: brc 0, .[[LAB:L.*]]-2 # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL -#CHECK: brc 0, .[[LAB:L.*]]-2 # encoding: [0xa7,0x04,A,A] +#CHECK: jnop .[[LAB:L.*]]-2 # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL brc 0, -2 jnop -2 #CHECK: brc 0, .[[LAB:L.*]] # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL -#CHECK: brc 0, .[[LAB:L.*]] # encoding: [0xa7,0x04,A,A] +#CHECK: jnop .[[LAB:L.*]] # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL brc 0, 0 jnop 0 #CHECK: brc 0, .[[LAB:L.*]]+65534 # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL -#CHECK: brc 0, .[[LAB:L.*]]+65534 # encoding: [0xa7,0x04,A,A] +#CHECK: jnop .[[LAB:L.*]]+65534 # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL brc 0, 0xfffe jnop 0xfffe #CHECK: brc 0, foo # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL -#CHECK: brc 0, foo # encoding: [0xa7,0x04,A,A] +#CHECK: jnop foo # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL brc 0, foo jnop foo @@ -1623,7 +1623,7 @@ #CHECK: brc 0, bar+100 # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL -#CHECK: brc 0, bar+100 # encoding: [0xa7,0x04,A,A] +#CHECK: jnop bar+100 # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL brc 0, bar+100 jnop bar+100 @@ -1735,7 +1735,7 @@ #CHECK: brc 0, bar@PLT # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL -#CHECK: brc 0, bar@PLT # encoding: [0xa7,0x04,A,A] +#CHECK: jnop bar@PLT # encoding: [0xa7,0x04,A,A] #CHECK: fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL 
brc 0, bar@PLT jnop bar@PLT @@ -1847,32 +1847,32 @@ #CHECK: brcl 0, .[[LAB:L.*]]-4294967296 # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL -#CHECK: brcl 0, .[[LAB:L.*]]-4294967296 # encoding: [0xc0,0x04,A,A,A,A] +#CHECK: jgnop .[[LAB:L.*]]-4294967296 # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL brcl 0, -0x100000000 jgnop -0x100000000 #CHECK: brcl 0, .[[LAB:L.*]]-2 # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL -#CHECK: brcl 0, .[[LAB:L.*]]-2 # encoding: [0xc0,0x04,A,A,A,A] +#CHECK: jgnop .[[LAB:L.*]]-2 # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL brcl 0, -2 jgnop -2 #CHECK: brcl 0, .[[LAB:L.*]] # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL -#CHECK: brcl 0, .[[LAB:L.*]] # encoding: [0xc0,0x04,A,A,A,A] +#CHECK: jgnop .[[LAB:L.*]] # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL brcl 0, 0 jgnop 0 #CHECK: brcl 0, .[[LAB:L.*]]+4294967294 # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL -#CHECK: brcl 0, .[[LAB:L.*]]+4294967294 # encoding: [0xc0,0x04,A,A,A,A] +#CHECK: jgnop .[[LAB:L.*]]+4294967294 # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL brcl 0, 0xfffffffe jgnop 0xfffffffe #CHECK: brcl 0, foo # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: foo+2, kind: FK_390_PC32DBL -#CHECK: brcl 0, foo # encoding: [0xc0,0x04,A,A,A,A] +#CHECK: jgnop foo # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: foo+2, kind: FK_390_PC32DBL brcl 0, foo jgnop foo @@ -2065,7 +2065,7 @@ #CHECK: brcl 0, bar+100 # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL -#CHECK: brcl 0, bar+100 # encoding: [0xc0,0x04,A,A,A,A] +#CHECK: jgnop bar+100 # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL brcl 0, bar+100 jgnop bar+100 @@ -2177,7 +2177,7 @@ #CHECK: brcl 0, bar@PLT # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC32DBL -#CHECK: brcl 0, bar@PLT # encoding: [0xc0,0x04,A,A,A,A] +#CHECK: jgnop bar@PLT # encoding: [0xc0,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC32DBL brcl 0, bar@PLT jgnop bar@PLT @@ -13142,7 +13142,7 @@ #CHECK: nop 0 # encoding: [0x47,0x00,0x00,0x00] #CHECK: nop # encoding: [0x47,0x00,0x00,0x00] #CHECK: nopr %r7 # encoding: [0x07,0x07] -#CHECK: nopr %r0 # encoding: [0x07,0x00] +#CHECK: nopr # encoding: [0x07,0x00] nop 0 nop diff --git a/llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll b/llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll index b321c0c82ad4d..6e9c64604e99d 100644 --- a/llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll +++ b/llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll @@ -1,5 +1,5 @@ -; RUN: llc %s -mattr=-bulk-memory -o - | FileCheck %s -; RUN: llc %s -mattr=-bulk-memory -o - | llvm-mc -triple=wasm32-unknown-unknown | FileCheck %s +; RUN: llc %s -mattr=-bulk-memory,-bulk-memory-opt -o - | FileCheck %s +; RUN: llc %s -mattr=-bulk-memory,-bulk-memory-opt -o - | llvm-mc -triple=wasm32-unknown-unknown | FileCheck %s ; ModuleID = 'test.c' source_filename = "test.c" diff --git 
a/llvm/test/MC/WebAssembly/function-alias.ll b/llvm/test/MC/WebAssembly/function-alias.ll index 036cd7d06e063..3f76516a9bcc8 100644 --- a/llvm/test/MC/WebAssembly/function-alias.ll +++ b/llvm/test/MC/WebAssembly/function-alias.ll @@ -1,5 +1,5 @@ -; RUN: llc -filetype=obj %s -mattr=-reference-types -o - | llvm-readobj --symbols - | FileCheck %s -; RUN: llc -filetype=obj %s -mattr=+reference-types -o - | llvm-readobj --symbols - | FileCheck --check-prefix=REF %s +; RUN: llc -filetype=obj %s -mattr=-reference-types,-call-indirect-overlong -o - | llvm-readobj --symbols - | FileCheck %s +; RUN: llc -filetype=obj %s -mattr=+reference-types,-call-indirect-overlong -o - | llvm-readobj --symbols - | FileCheck --check-prefix=REF %s target triple = "wasm32-unknown-unknown-wasm" diff --git a/llvm/test/MC/WebAssembly/libcall.ll b/llvm/test/MC/WebAssembly/libcall.ll index ffd32abe2345b..6f36ab7ad317f 100644 --- a/llvm/test/MC/WebAssembly/libcall.ll +++ b/llvm/test/MC/WebAssembly/libcall.ll @@ -1,4 +1,4 @@ -; RUN: llc -filetype=obj -mattr=-bulk-memory %s -o - | obj2yaml | FileCheck %s +; RUN: llc -filetype=obj -mattr=-bulk-memory,-bulk-memory-opt %s -o - | obj2yaml | FileCheck %s target triple = "wasm32-unknown-unknown" diff --git a/llvm/test/MachineVerifier/RISCV/subreg-liveness.mir b/llvm/test/MachineVerifier/RISCV/subreg-liveness.mir index cb73f500ddc21..3c7b70efe7199 100644 --- a/llvm/test/MachineVerifier/RISCV/subreg-liveness.mir +++ b/llvm/test/MachineVerifier/RISCV/subreg-liveness.mir @@ -19,8 +19,8 @@ body: | ; CHECK: liveins: $v0, $v8, $v9, $v10, $v11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $v16m2 = PseudoVMV_V_I_M2 undef renamable $v16m2, 0, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: $v20m2 = VMV2R_V $v14m2, implicit $v12_v13_v14_v15_v16 + ; CHECK-NEXT: $v20m2 = VMV2R_V $v14m2, implicit $v12_v13_v14_v15_v16, implicit $vtype renamable $v16m2 = PseudoVMV_V_I_M2 undef renamable $v16m2, 0, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - $v20m2 = VMV2R_V $v14m2, implicit $v12_v13_v14_v15_v16 + $v20m2 = VMV2R_V $v14m2, implicit $v12_v13_v14_v15_v16, implicit $vtype ... diff --git a/llvm/test/TableGen/intrinsic-overload-conflict.td b/llvm/test/TableGen/intrinsic-overload-conflict.td index 84333119d41f5..13431c3bc49e0 100644 --- a/llvm/test/TableGen/intrinsic-overload-conflict.td +++ b/llvm/test/TableGen/intrinsic-overload-conflict.td @@ -6,13 +6,17 @@ include "llvm/IR/Intrinsics.td" // CHECK: foo = 1, def int_foo : Intrinsic<[llvm_any_ty]>; -// No conflicts, since .bar is not a vaid mangled type. +// No conflicts, since .bar is not a valid mangled type. // CHECK: foo_bar, def int_foo_bar : Intrinsic<[llvm_i32_ty]>; // CHECK: foo_bar_f32, def int_foo_bar_f32 : Intrinsic<[llvm_i32_ty]>; +// No conflicts, since i is not a valid mangled type without a bitwidth. 
+// CHECK: foo_i +def int_foo_i : Intrinsic<[llvm_i32_ty]>; + #ifdef CONFLICT // CHECK-CONFLICT: error: intrinsic `llvm.foo.a3` cannot share prefix `llvm.foo.a3` with another overloaded intrinsic `llvm.foo` // CHECK-CONFLICT: error: intrinsic `llvm.foo.bf16` cannot share prefix `llvm.foo.bf16` with another overloaded intrinsic `llvm.foo` diff --git a/llvm/test/ThinLTO/X86/distributed_indexes.ll b/llvm/test/ThinLTO/X86/distributed_indexes.ll index 4f2662b1b34e1..824c582c2025e 100644 --- a/llvm/test/ThinLTO/X86/distributed_indexes.ll +++ b/llvm/test/ThinLTO/X86/distributed_indexes.ll @@ -48,6 +48,10 @@ ; RUN: llvm-dis %t1.bc.thinlto.bc -o - | FileCheck %s --check-prefix=DIS ; DIS: aliasee: null +; RUN: opt -passes=function-import -import-all-index -summary-file=%t1.bc.thinlto.bc %t1.bc -S -o - 2>&1 | FileCheck %s --check-prefix=IR +; Tests that the analias definition is imported. +; IR: define available_externally void @analias + target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" declare void @g(...) diff --git a/llvm/test/ThinLTO/X86/import_callee_declaration.ll b/llvm/test/ThinLTO/X86/import_callee_declaration.ll index 246920e5db0dc..72550fa4d6f0b 100644 --- a/llvm/test/ThinLTO/X86/import_callee_declaration.ll +++ b/llvm/test/ThinLTO/X86/import_callee_declaration.ll @@ -34,11 +34,14 @@ ; RUN: -r=main.bc,main,px \ ; RUN: -r=main.bc,small_func, \ ; RUN: -r=main.bc,large_func, \ +; RUN: -r=main.bc,read_write_global_vars, \ +; RUN: -r=main.bc,external_func, \ ; RUN: -r=lib.bc,callee,pl \ ; RUN: -r=lib.bc,large_indirect_callee,px \ ; RUN: -r=lib.bc,large_indirect_bar,px \ ; RUN: -r=lib.bc,small_func,px \ ; RUN: -r=lib.bc,large_func,px \ +; RUN: -r=lib.bc,read_write_global_vars,px \ ; RUN: -r=lib.bc,large_indirect_callee_alias,px \ ; RUN: -r=lib.bc,large_indirect_bar_alias,px \ ; RUN: -r=lib.bc,calleeAddrs,px -r=lib.bc,calleeAddrs2,px -o summary main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP @@ -71,13 +74,22 @@ ; MAIN-DIS: [[LARGEINDIRECT:\^[0-9]+]] = gv: (guid: 14343440786664691134, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}}))) ; MAIN-DIS: gv: (guid: 16730173943625350469, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), aliasee: [[LARGEINDIRECT]]))) +; RUN: opt -passes=function-import -import-all-index -summary-file=main.bc.thinlto.bc main.bc -o main-after-import.bc +; RUN: llvm-dis -o - main-after-import.bc | FileCheck %s --check-prefix=MAIN-IMPORT + +; Tests that the dso_local attribute is applied to a global var from its summary. +; MAIN-IMPORT: @read_write_global_vars = external dso_local global [1 x ptr] + ; Run in-process ThinLTO and tests that ; 1. `callee` remains internalized even if the symbols of its callers ; (large_func, large_indirect_callee, large_indirect_bar) are exported as ; declarations and visible to main module. ; 2. the debugging logs from `function-import` pass are expected. +; Set relocation model to static so the dso_local attribute from a summary is +; applied to the global variable declaration. 
; RUN: llvm-lto2 run \ +; RUN: -relocation-model=static \ ; RUN: -debug-only=function-import \ ; RUN: -save-temps \ ; RUN: -thinlto-threads=1 \ @@ -87,11 +99,14 @@ ; RUN: -r=main.bc,main,px \ ; RUN: -r=main.bc,small_func, \ ; RUN: -r=main.bc,large_func, \ +; RUN: -r=main.bc,read_write_global_vars, \ +; RUN: -r=main.bc,external_func, \ ; RUN: -r=lib.bc,callee,pl \ ; RUN: -r=lib.bc,large_indirect_callee,px \ ; RUN: -r=lib.bc,large_indirect_bar,px \ ; RUN: -r=lib.bc,small_func,px \ ; RUN: -r=lib.bc,large_func,px \ +; RUN: -r=lib.bc,read_write_global_vars,px \ ; RUN: -r=lib.bc,large_indirect_callee_alias,px \ ; RUN: -r=lib.bc,large_indirect_bar_alias,px \ ; RUN: -r=lib.bc,calleeAddrs,px -r=lib.bc,calleeAddrs2,px -o in-process main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=IMPORTDUMP @@ -103,7 +118,7 @@ ; IMPORTDUMP-DAG: Is importing function definition 13568239288960714650 small_indirect_callee from lib.cc ; IMPORTDUMP-DAG: Is importing function definition 6976996067367342685 small_func from lib.cc ; IMPORTDUMP-DAG: Is importing function declaration 2418497564662708935 large_func from lib.cc -; IMPORTDUMP-DAG: Not importing global 7680325410415171624 calleeAddrs from lib.cc +; IMPORTDUMP-DAG: Is importing global declaration 7680325410415171624 calleeAddrs from lib.cc ; IMPORTDUMP-DAG: Is importing alias declaration 16730173943625350469 large_indirect_callee_alias from lib.cc ; IMPORTDUMP-DAG: Is importing alias declaration 13590951773474913315 large_indirect_bar_alias from lib.cc ; IMPORTDUMP-DAG: Not importing function 13770917885399536773 large_indirect_bar @@ -115,6 +130,8 @@ ; IMPORT-DAG: define available_externally void @small_func ; IMPORT-DAG: define available_externally hidden void @small_indirect_callee ; IMPORT-DAG: declare void @large_func +; Tests that the dso_local attribute is applied to a global var from its summary. +; IMPORT-DAG: @read_write_global_vars = external dso_local global [1 x ptr] ; IMPORT-NOT: large_indirect_callee ; IMPORT-NOT: large_indirect_callee_alias ; IMPORT-NOT: large_indirect_bar @@ -126,9 +143,14 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" +@read_write_global_vars = external global [1 x ptr] + define i32 @main() { call void @small_func() call void @large_func() + %num = call ptr @external_func(ptr @read_write_global_vars) + store ptr %num, ptr getelementptr inbounds ([1 x ptr], ptr @read_write_global_vars, i64 0, i64 0) + %res1 = call i32 @external_func(ptr @read_write_global_vars) ret i32 0 } @@ -137,6 +159,8 @@ declare void @small_func() ; large_func without attributes declare void @large_func() +declare ptr @external_func(ptr) + ;--- lib.ll source_filename = "lib.cc" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -149,6 +173,10 @@ target triple = "x86_64-unknown-linux-gnu" ; large_indirect_bar_alias is visible to main.ll but its aliasee isn't. @calleeAddrs2 = global [1 x ptr] [ptr @large_indirect_bar_alias] +; @read_write_global_vars is neither read-only nor write-only (in main.ll). It's not +; a constant global var and has references, so it's not importable as a definition. 
+@read_write_global_vars = dso_local global [1 x ptr] [ptr @large_indirect_callee] + define void @callee() #1 { ret void } diff --git a/llvm/test/Transforms/ConstraintElimination/constraint-overflow.ll b/llvm/test/Transforms/ConstraintElimination/constraint-overflow.ll index 88f87f4afab28..57b7b11be0cf1 100644 --- a/llvm/test/Transforms/ConstraintElimination/constraint-overflow.ll +++ b/llvm/test/Transforms/ConstraintElimination/constraint-overflow.ll @@ -38,3 +38,17 @@ exit: } declare void @llvm.assume(i1) + +define i1 @negate_overflow_add_1(i64 %x) { +; CHECK-LABEL: define i1 @negate_overflow_add_1( +; CHECK-SAME: i64 [[X:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUB:%.*]] = add nsw i64 [[X]], -9223372036854775807 +; CHECK-NEXT: [[C:%.*]] = icmp slt i64 0, [[SUB]] +; CHECK-NEXT: ret i1 [[C]] +; +entry: + %sub = add nsw i64 %x, -9223372036854775807 + %c = icmp slt i64 0, %sub + ret i1 %c +} diff --git a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll index a4d825b327969..5e2bab28807f5 100644 --- a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll +++ b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll @@ -630,3 +630,68 @@ ptr.check: exit: ret i4 3 } + +define i1 @test_nusw(ptr %p, i32 %x, i32 %y) { +; CHECK-LABEL: @test_nusw( +; CHECK-NEXT: [[X_EXT:%.*]] = zext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[Y_EXT:%.*]] = zext i32 [[Y:%.*]] to i64 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[X_EXT]], [[Y_EXT]] +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP1]]) +; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr nusw i8, ptr [[P:%.*]], i64 [[X_EXT]] +; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr nusw i8, ptr [[P]], i64 [[Y_EXT]] +; CHECK-NEXT: ret i1 true +; + %x.ext = zext i32 %x to i64 + %y.ext = zext i32 %y to i64 + %cmp1 = icmp ugt i64 %x.ext, %y.ext + call void @llvm.assume(i1 %cmp1) + %gep.x = getelementptr nusw i8, ptr %p, i64 %x.ext + %gep.y = getelementptr nusw i8, ptr %p, i64 %y.ext + %cmp2 = icmp ugt ptr %gep.x, %gep.y + ret i1 %cmp2 +} + +define i1 @test_nusw_nested(ptr %p, i32 %x, i32 %y) { +; CHECK-LABEL: @test_nusw_nested( +; CHECK-NEXT: [[X_EXT:%.*]] = zext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[Y_EXT:%.*]] = zext i32 [[Y:%.*]] to i64 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[X_EXT]], [[Y_EXT]] +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP1]]) +; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr nusw i8, ptr [[P:%.*]], i64 [[X_EXT]] +; CHECK-NEXT: [[GEP_X1:%.*]] = getelementptr nusw i8, ptr [[GEP_X]], i64 1 +; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr nusw i8, ptr [[P]], i64 [[Y_EXT]] +; CHECK-NEXT: ret i1 true +; + %x.ext = zext i32 %x to i64 + %y.ext = zext i32 %y to i64 + %cmp1 = icmp ugt i64 %x.ext, %y.ext + call void @llvm.assume(i1 %cmp1) + %gep.x = getelementptr nusw i8, ptr %p, i64 %x.ext + %gep.x1 = getelementptr nusw i8, ptr %gep.x, i64 1 + %gep.y = getelementptr nusw i8, ptr %p, i64 %y.ext + %cmp2 = icmp ugt ptr %gep.x1, %gep.y + ret i1 %cmp2 +} + +define i1 @test_missing_nusw(ptr %p, i32 %x, i32 %y) { +; CHECK-LABEL: @test_missing_nusw( +; CHECK-NEXT: [[X_EXT:%.*]] = zext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[Y_EXT:%.*]] = zext i32 [[Y:%.*]] to i64 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[X_EXT]], [[Y_EXT]] +; CHECK-NEXT: call void @llvm.assume(i1 [[CMP1]]) +; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr nusw i8, ptr [[P:%.*]], i64 [[X_EXT]] +; CHECK-NEXT: [[GEP_X1:%.*]] = getelementptr i8, ptr [[GEP_X]], i64 1 +; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr nusw i8, ptr [[P]], i64 
[[Y_EXT]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt ptr [[GEP_X1]], [[GEP_Y]] +; CHECK-NEXT: ret i1 [[CMP2]] +; + %x.ext = zext i32 %x to i64 + %y.ext = zext i32 %y to i64 + %cmp1 = icmp ugt i64 %x.ext, %y.ext + call void @llvm.assume(i1 %cmp1) + %gep.x = getelementptr nusw i8, ptr %p, i64 %x.ext + %gep.x1 = getelementptr i8, ptr %gep.x, i64 1 + %gep.y = getelementptr nusw i8, ptr %p, i64 %y.ext + %cmp2 = icmp ugt ptr %gep.x1, %gep.y + ret i1 %cmp2 +} diff --git a/llvm/test/Transforms/GVN/setjmp.ll b/llvm/test/Transforms/GVN/setjmp.ll index 0277fcfa226ed..07b7028346760 100644 --- a/llvm/test/Transforms/GVN/setjmp.ll +++ b/llvm/test/Transforms/GVN/setjmp.ll @@ -5,7 +5,6 @@ declare i32 @setjmp() returns_twice declare void @longjmp() declare ptr @malloc(i64) -; FIXME: This is a miscompile. define i32 @test() { ; CHECK-LABEL: define i32 @test() { ; CHECK-NEXT: [[MALLOC:%.*]] = call noalias ptr @malloc(i64 4) @@ -18,7 +17,8 @@ define i32 @test() { ; CHECK-NEXT: call void @longjmp() ; CHECK-NEXT: unreachable ; CHECK: [[IF_END]]: -; CHECK-NEXT: ret i32 10 +; CHECK-NEXT: [[RES:%.*]] = load i32, ptr [[MALLOC]], align 4 +; CHECK-NEXT: ret i32 [[RES]] ; %malloc = call noalias ptr @malloc(i64 4) store i32 10, ptr %malloc, align 4 @@ -35,3 +35,65 @@ if.end: %res = load i32, ptr %malloc ret i32 %res } + +; We are still allowed to optimize non-volatile accesses to allocas. +define i32 @test_alloca() { +; CHECK-LABEL: define i32 @test_alloca() { +; CHECK-NEXT: [[ALLOC:%.*]] = alloca i43, align 8 +; CHECK-NEXT: store i32 10, ptr [[ALLOC]], align 4 +; CHECK-NEXT: [[SJ:%.*]] = call i32 @setjmp() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[SJ]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: store i32 20, ptr [[ALLOC]], align 4 +; CHECK-NEXT: call void @longjmp() +; CHECK-NEXT: unreachable +; CHECK: [[IF_END]]: +; CHECK-NEXT: ret i32 10 +; + %alloc = alloca i43 + store i32 10, ptr %alloc, align 4 + %sj = call i32 @setjmp() + %cmp = icmp eq i32 %sj, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + store i32 20, ptr %alloc + call void @longjmp() + unreachable + +if.end: + %res = load i32, ptr %alloc + ret i32 %res +} + +define i32 @test_alloca_volatile() { +; CHECK-LABEL: define i32 @test_alloca_volatile() { +; CHECK-NEXT: [[ALLOC:%.*]] = alloca i43, align 8 +; CHECK-NEXT: store volatile i32 10, ptr [[ALLOC]], align 4 +; CHECK-NEXT: [[SJ:%.*]] = call i32 @setjmp() +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[SJ]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: store volatile i32 20, ptr [[ALLOC]], align 4 +; CHECK-NEXT: call void @longjmp() +; CHECK-NEXT: unreachable +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[RES:%.*]] = load volatile i32, ptr [[ALLOC]], align 4 +; CHECK-NEXT: ret i32 [[RES]] +; + %alloc = alloca i43 + store volatile i32 10, ptr %alloc, align 4 + %sj = call i32 @setjmp() + %cmp = icmp eq i32 %sj, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + store volatile i32 20, ptr %alloc + call void @longjmp() + unreachable + +if.end: + %res = load volatile i32, ptr %alloc + ret i32 %res +} diff --git a/llvm/test/Transforms/GlobalOpt/malloc-promote-addrspace.ll b/llvm/test/Transforms/GlobalOpt/malloc-promote-addrspace.ll new file mode 100644 index 0000000000000..bd0957150748f --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/malloc-promote-addrspace.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: 
--function-signature --check-globals +; RUN: opt -S -passes=globalopt -o - < %s | FileCheck %s + +@g = internal addrspace(200) global ptr null, align 8 + +;. +; CHECK: @g.init = internal unnamed_addr addrspace(200) global i1 false +;. +define internal i32 @f1() { +; CHECK-LABEL: define {{[^@]+}}@f1() unnamed_addr { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[G_INIT_VAL:%.*]] = load i1, ptr addrspace(200) @g.init, align 1 +; CHECK-NEXT: call fastcc void @f2() +; CHECK-NEXT: [[NOTINIT:%.*]] = xor i1 [[G_INIT_VAL]], true +; CHECK-NEXT: br i1 [[NOTINIT]], label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB4:%.*]] +; CHECK: bb3: +; CHECK-NEXT: br label [[BB4]] +; CHECK: bb4: +; CHECK-NEXT: [[I5:%.*]] = phi i32 [ -1, [[BB2]] ], [ 1, [[BB3]] ] +; CHECK-NEXT: ret i32 [[I5]] +; +bb: + %i = load ptr addrspace(200), ptr addrspace(200) @g, align 8 + call void @f2() + %i1 = icmp eq ptr addrspace(200) %i, null + br i1 %i1, label %bb2, label %bb3 + +bb2: ; preds = %bb + br label %bb4 + +bb3: ; preds = %bb + br label %bb4 + +bb4: ; preds = %bb3, %bb2 + %i5 = phi i32 [ -1, %bb2 ], [ 1, %bb3 ] + ret i32 %i5 +} + +define internal void @f2() { +; CHECK-LABEL: define {{[^@]+}}@f2() unnamed_addr { +; CHECK-NEXT: bb: +; CHECK-NEXT: store i1 true, ptr addrspace(200) @g.init, align 1 +; CHECK-NEXT: ret void +; +bb: + %i = call noalias ptr @malloc(i64 4) + store ptr %i, ptr addrspace(200) @g, align 8 + ret void +} + +define dso_local i32 @main() { +; CHECK-LABEL: define {{[^@]+}}@main() local_unnamed_addr { +; CHECK-NEXT: bb: +; CHECK-NEXT: store i1 false, ptr addrspace(200) @g.init, align 1 +; CHECK-NEXT: [[I:%.*]] = call fastcc i32 @f1() +; CHECK-NEXT: ret i32 [[I]] +; +bb: + store ptr null, ptr addrspace(200) @g, align 8 + %i = call i32 @f1() + ret i32 %i +} + +; Function Attrs: allockind("alloc,uninitialized") allocsize(0) +declare dso_local noalias ptr @malloc(i64) #0 + +attributes #0 = { allockind("alloc,uninitialized") allocsize(0) } diff --git a/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll b/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll index c49aea3e82b56..0d0af91608e7a 100644 --- a/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll +++ b/llvm/test/Transforms/InstCombine/fcmp-fadd-select.ll @@ -6,7 +6,7 @@ define float @test_fcmp_ogt_fadd_select_constant(float %in) { ; CHECK-LABEL: define float @test_fcmp_ogt_fadd_select_constant( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -19,7 +19,7 @@ define float @test_fcmp_ogt_fadd_select_constant(float %in) { define float @test_fcmp_ogt_fadd_select_constant_swapped(float %in) { ; CHECK-LABEL: define float @test_fcmp_ogt_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -32,7 +32,7 @@ define float @test_fcmp_ogt_fadd_select_constant_swapped(float %in) { define float @test_fcmp_ogt_fadd_select_neg_constant(float %in) { ; CHECK-LABEL: define float @test_fcmp_ogt_fadd_select_neg_constant( ; 
CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -45,7 +45,7 @@ define float @test_fcmp_ogt_fadd_select_neg_constant(float %in) { define float @test_fcmp_ogt_fadd_select_fastmath_preserve(float %in) { ; CHECK-LABEL: define float @test_fcmp_ogt_fadd_select_fastmath_preserve( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -58,7 +58,7 @@ define float @test_fcmp_ogt_fadd_select_fastmath_preserve(float %in) { define <2 x float> @test_fcmp_ogt_fadd_select_constant_vectors(<2 x float> %in) { ; CHECK-LABEL: define <2 x float> @test_fcmp_ogt_fadd_select_constant_vectors( ; CHECK-SAME: <2 x float> [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz <2 x float> @llvm.maxnum.v2f32(<2 x float> [[IN]], <2 x float> zeroinitializer) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz <2 x float> @llvm.maxnum.v2f32(<2 x float> [[IN]], <2 x float> zeroinitializer) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz <2 x float> [[SEL_NEW]], splat (float 1.000000e+00) ; CHECK-NEXT: ret <2 x float> [[ADD_NEW]] ; @@ -74,7 +74,7 @@ define <2 x float> @test_fcmp_ogt_fadd_select_constant_vectors(<2 x float> %in) define float @test_fcmp_olt_fadd_select_constant(float %in) { ; CHECK-LABEL: define float @test_fcmp_olt_fadd_select_constant( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -87,7 +87,7 @@ define float @test_fcmp_olt_fadd_select_constant(float %in) { define float @test_fcmp_olt_fadd_select_constant_swapped(float %in) { ; CHECK-LABEL: define float @test_fcmp_olt_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -100,7 +100,7 @@ define float @test_fcmp_olt_fadd_select_constant_swapped(float %in) { define float @test_fcmp_olt_fadd_select_neg_constant(float %in) { ; CHECK-LABEL: define float @test_fcmp_olt_fadd_select_neg_constant( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -113,7 +113,7 @@ define float @test_fcmp_olt_fadd_select_neg_constant(float %in) { define float @test_fcmp_olt_fadd_select_fastmath_preserve(float %in) { ; CHECK-LABEL: define float 
@test_fcmp_olt_fadd_select_fastmath_preserve( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -126,7 +126,7 @@ define float @test_fcmp_olt_fadd_select_fastmath_preserve(float %in) { define <2 x float> @test_fcmp_olt_fadd_select_constant_vectors(<2 x float> %in) { ; CHECK-LABEL: define <2 x float> @test_fcmp_olt_fadd_select_constant_vectors( ; CHECK-SAME: <2 x float> [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz <2 x float> @llvm.minnum.v2f32(<2 x float> [[IN]], <2 x float> zeroinitializer) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz <2 x float> @llvm.minnum.v2f32(<2 x float> [[IN]], <2 x float> zeroinitializer) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz <2 x float> [[SEL_NEW]], splat (float 1.000000e+00) ; CHECK-NEXT: ret <2 x float> [[ADD_NEW]] ; @@ -142,7 +142,7 @@ define <2 x float> @test_fcmp_olt_fadd_select_constant_vectors(<2 x float> %in) define float @test_fcmp_oge_fadd_select_constant(float %in) { ; CHECK-LABEL: define float @test_fcmp_oge_fadd_select_constant( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -155,7 +155,7 @@ define float @test_fcmp_oge_fadd_select_constant(float %in) { define float @test_fcmp_oge_fadd_select_constant_swapped(float %in) { ; CHECK-LABEL: define float @test_fcmp_oge_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -168,7 +168,7 @@ define float @test_fcmp_oge_fadd_select_constant_swapped(float %in) { define float @test_fcmp_oge_fadd_select_neg_constant(float %in) { ; CHECK-LABEL: define float @test_fcmp_oge_fadd_select_neg_constant( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -181,7 +181,7 @@ define float @test_fcmp_oge_fadd_select_neg_constant(float %in) { define float @test_fcmp_oge_fadd_select_fastmath_preserve(float %in) { ; CHECK-LABEL: define float @test_fcmp_oge_fadd_select_fastmath_preserve( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -194,7 +194,7 @@ define float @test_fcmp_oge_fadd_select_fastmath_preserve(float %in) { define <2 x float> 
@test_fcmp_oge_fadd_select_constant_vectors(<2 x float> %in) { ; CHECK-LABEL: define <2 x float> @test_fcmp_oge_fadd_select_constant_vectors( ; CHECK-SAME: <2 x float> [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz <2 x float> @llvm.maxnum.v2f32(<2 x float> [[IN]], <2 x float> zeroinitializer) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz <2 x float> @llvm.maxnum.v2f32(<2 x float> [[IN]], <2 x float> zeroinitializer) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz <2 x float> [[SEL_NEW]], splat (float 1.000000e+00) ; CHECK-NEXT: ret <2 x float> [[ADD_NEW]] ; @@ -210,7 +210,7 @@ define <2 x float> @test_fcmp_oge_fadd_select_constant_vectors(<2 x float> %in) define float @test_fcmp_ole_fadd_select_constant(float %in) { ; CHECK-LABEL: define float @test_fcmp_ole_fadd_select_constant( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -223,7 +223,7 @@ define float @test_fcmp_ole_fadd_select_constant(float %in) { define float @test_fcmp_ole_fadd_select_constant_swapped(float %in) { ; CHECK-LABEL: define float @test_fcmp_ole_fadd_select_constant_swapped( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -236,7 +236,7 @@ define float @test_fcmp_ole_fadd_select_constant_swapped(float %in) { define float @test_fcmp_ole_fadd_select_neg_constant(float %in) { ; CHECK-LABEL: define float @test_fcmp_ole_fadd_select_neg_constant( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -249,7 +249,7 @@ define float @test_fcmp_ole_fadd_select_neg_constant(float %in) { define float @test_fcmp_ole_fadd_select_fastmath_preserve(float %in) { ; CHECK-LABEL: define float @test_fcmp_ole_fadd_select_fastmath_preserve( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.minnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -262,7 +262,7 @@ define float @test_fcmp_ole_fadd_select_fastmath_preserve(float %in) { define <2 x float> @test_fcmp_ole_fadd_select_constant_vectors(<2 x float> %in) { ; CHECK-LABEL: define <2 x float> @test_fcmp_ole_fadd_select_constant_vectors( ; CHECK-SAME: <2 x float> [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz <2 x float> @llvm.minnum.v2f32(<2 x float> [[IN]], <2 x float> zeroinitializer) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz <2 x float> @llvm.minnum.v2f32(<2 x float> [[IN]], <2 x float> zeroinitializer) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz <2 x float> [[SEL_NEW]], splat (float 1.000000e+00) ; CHECK-NEXT: ret <2 x 
float> [[ADD_NEW]] ; @@ -637,7 +637,7 @@ define float @test_fcmp_multiple_uses(float %in) { define float @test_fcmp_ogt_fadd_select_rewrite_flags1(float %in) { ; CHECK-LABEL: define float @test_fcmp_ogt_fadd_select_rewrite_flags1( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call reassoc nnan nsz arcp contract afn float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call reassoc nsz arcp contract afn float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd reassoc nnan nsz arcp contract afn float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -650,7 +650,7 @@ define float @test_fcmp_ogt_fadd_select_rewrite_flags1(float %in) { define float @test_fcmp_ogt_fadd_select_rewrite_flags2(float %in) { ; CHECK-LABEL: define float @test_fcmp_ogt_fadd_select_rewrite_flags2( ; CHECK-SAME: float [[IN:%.*]]) { -; CHECK-NEXT: [[SEL_NEW:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) +; CHECK-NEXT: [[SEL_NEW:%.*]] = call nsz float @llvm.maxnum.f32(float [[IN]], float 0.000000e+00) ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd nnan nsz float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; @@ -667,7 +667,7 @@ define float @test_fcmp_ogt_fadd_select_rewrite_and_fastmath(float %in) { ; CHECK-NEXT: [[ADD_NEW:%.*]] = fadd fast float [[SEL_NEW]], 1.000000e+00 ; CHECK-NEXT: ret float [[ADD_NEW]] ; - %cmp1 = fcmp ogt float %in, 0.000000e+00 + %cmp1 = fcmp nnan ogt float %in, 0.000000e+00 %add = fadd fast reassoc float %in, 1.000000e+00 %sel = select fast i1 %cmp1, float %add, float 1.000000e+00 ret float %sel diff --git a/llvm/test/Transforms/InstCombine/fcmp-select.ll b/llvm/test/Transforms/InstCombine/fcmp-select.ll index 028de1ff8a99f..408bc1cdc268f 100644 --- a/llvm/test/Transforms/InstCombine/fcmp-select.ll +++ b/llvm/test/Transforms/InstCombine/fcmp-select.ll @@ -219,8 +219,8 @@ define double @test_fcmp_select_clamp(double %x) { define double @test_fcmp_select_maxnum(double %x) { ; CHECK-LABEL: @test_fcmp_select_maxnum( -; CHECK-NEXT: [[SEL1:%.*]] = call nnan nsz double @llvm.maxnum.f64(double [[X:%.*]], double 1.000000e+00) -; CHECK-NEXT: [[SEL2:%.*]] = call nnan nsz double @llvm.minnum.f64(double [[SEL1]], double 2.550000e+02) +; CHECK-NEXT: [[SEL1:%.*]] = call nsz double @llvm.maxnum.f64(double [[X:%.*]], double 1.000000e+00) +; CHECK-NEXT: [[SEL2:%.*]] = call nsz double @llvm.minnum.f64(double [[SEL1]], double 2.550000e+02) ; CHECK-NEXT: ret double [[SEL2]] ; %cmp1 = fcmp ogt double %x, 1.0 diff --git a/llvm/test/Transforms/InstCombine/fneg.ll b/llvm/test/Transforms/InstCombine/fneg.ll index 9692005edf2b6..549291f2c4f0d 100644 --- a/llvm/test/Transforms/InstCombine/fneg.ll +++ b/llvm/test/Transforms/InstCombine/fneg.ll @@ -1099,7 +1099,7 @@ define float @test_fneg_select_constant_var_multiuse(i1 %cond, float %x) { define float @test_fneg_select_maxnum(float %x) { ; CHECK-LABEL: @test_fneg_select_maxnum( -; CHECK-NEXT: [[SEL1:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[X:%.*]], float 1.000000e+00) +; CHECK-NEXT: [[SEL1:%.*]] = call nsz float @llvm.maxnum.f32(float [[X:%.*]], float 1.000000e+00) ; CHECK-NEXT: [[NEG:%.*]] = fneg float [[SEL1]] ; CHECK-NEXT: ret float [[NEG]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-binop.ll b/llvm/test/Transforms/InstCombine/icmp-binop.ll index 878f39bb7c9a5..356489716fff9 100644 --- a/llvm/test/Transforms/InstCombine/icmp-binop.ll +++ b/llvm/test/Transforms/InstCombine/icmp-binop.ll @@ -252,3 +252,110 
@@ false: call void @use64(i64 %v) ret i1 false } + +define i1 @test_icmp_sgt_and_negpow2_zero(i32 %add) { +; CHECK-LABEL: @test_icmp_sgt_and_negpow2_zero( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[ADD:%.*]], 7 +; CHECK-NEXT: ret i1 [[CMP]] +; + %and = and i32 %add, -8 + %cmp = icmp sgt i32 %and, 0 + ret i1 %cmp +} + +define i1 @test_icmp_slt_and_negpow2_one(i32 %add) { +; CHECK-LABEL: @test_icmp_slt_and_negpow2_one( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD:%.*]], 8 +; CHECK-NEXT: ret i1 [[CMP]] +; + %and = and i32 %add, -8 + %cmp = icmp slt i32 %and, 1 + ret i1 %cmp +} + +define i1 @test_icmp_sgt_and_negpow2_nonzero(i32 %add) { +; CHECK-LABEL: @test_icmp_sgt_and_negpow2_nonzero( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[ADD:%.*]], -8 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[AND]], -2 +; CHECK-NEXT: ret i1 [[CMP]] +; + %and = and i32 %add, -8 + %cmp = icmp sgt i32 %and, -2 + ret i1 %cmp +} + +define i1 @test_icmp_sgt_and_nonnegpow2_zero(i32 %add) { +; CHECK-LABEL: @test_icmp_sgt_and_nonnegpow2_zero( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[ADD:%.*]], 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %and = and i32 %add, 8 + %cmp = icmp sgt i32 %and, 0 + ret i1 %cmp +} + +define i1 @test_icmp_ult_and_negpow2_one(i32 %add) { +; CHECK-LABEL: @test_icmp_ult_and_negpow2_one( +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD:%.*]], 8 +; CHECK-NEXT: ret i1 [[CMP]] +; + %and = and i32 %add, -8 + %cmp = icmp ult i32 %and, 1 + ret i1 %cmp +} + +define i1 @test_imply_dom_condition(i32 %add) { +; CHECK-LABEL: @test_imply_dom_condition( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[ADD:%.*]], 7 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: ret i1 false +; + %and = and i32 %add, -8 + %cmp = icmp sgt i32 %and, 0 + tail call void @llvm.assume(i1 %cmp) + %min.iters.check = icmp ult i32 %and, 8 + ret i1 %min.iters.check +} + +define i1 @test_icmp_slt_and_negpow2_c(i32 %add) { +; CHECK-LABEL: @test_icmp_slt_and_negpow2_c( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD:%.*]], 32 +; CHECK-NEXT: ret i1 [[CMP]] +; + %and = and i32 %add, -32 + %cmp = icmp slt i32 %and, 16 + ret i1 %cmp +} + +define i1 @test_icmp_slt_and_negpow2_invalid_c(i32 %add) { +; CHECK-LABEL: @test_icmp_slt_and_negpow2_invalid_c( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[ADD:%.*]], -32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[AND]], 48 +; CHECK-NEXT: ret i1 [[CMP]] +; + %and = and i32 %add, -32 + %cmp = icmp slt i32 %and, 48 + ret i1 %cmp +} + +define i1 @test_icmp_sgt_and_negpow2_c(i32 %add) { +; CHECK-LABEL: @test_icmp_sgt_and_negpow2_c( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[ADD:%.*]], 31 +; CHECK-NEXT: ret i1 [[CMP]] +; + %and = and i32 %add, -32 + %cmp = icmp sgt i32 %and, 16 + ret i1 %cmp +} + +define i1 @test_icmp_sgt_and_negpow2_invalid_c(i32 %add) { +; CHECK-LABEL: @test_icmp_sgt_and_negpow2_invalid_c( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[ADD:%.*]], -32 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[AND]], 48 +; CHECK-NEXT: ret i1 [[CMP]] +; + %and = and i32 %add, -32 + %cmp = icmp sgt i32 %and, 48 + ret i1 %cmp +} diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll index 776716fe90873..1bc000cd6ebf1 100644 --- a/llvm/test/Transforms/InstCombine/icmp-gep.ll +++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll @@ -133,6 +133,54 @@ define i1 @ult_base_inbounds(ptr %x, i64 %y) { ret i1 %r } +define i1 @ult_base_nusw(ptr %x, i64 %y) { +; CHECK-LABEL: @ult_base_nusw( +; CHECK-NEXT: [[R:%.*]] = icmp slt i64 [[Y:%.*]], 
0 +; CHECK-NEXT: ret i1 [[R]] +; + %g = getelementptr nusw i8, ptr %x, i64 %y + %r = icmp ult ptr %g, %x + ret i1 %r +} + +define i1 @ugt_base_nuw(ptr %x, i64 %y) { +; CHECK-LABEL: @ugt_base_nuw( +; CHECK-NEXT: [[R:%.*]] = icmp ne i64 [[Y:%.*]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %g = getelementptr nuw i8, ptr %x, i64 %y + %r = icmp ugt ptr %g, %x + ret i1 %r +} + +define i1 @ugt_base_nusw_nuw(ptr %x, i64 %y) { +; CHECK-LABEL: @ugt_base_nusw_nuw( +; CHECK-NEXT: [[R:%.*]] = icmp ne i64 [[Y:%.*]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %g = getelementptr nusw nuw i8, ptr %x, i64 %y + %r = icmp ugt ptr %g, %x + ret i1 %r +} + +define i1 @uge_base_nuw(ptr %x, i64 %y) { +; CHECK-LABEL: @uge_base_nuw( +; CHECK-NEXT: ret i1 true +; + %g = getelementptr nuw i8, ptr %x, i64 %y + %r = icmp uge ptr %g, %x + ret i1 %r +} + +define i1 @uge_base_nusw_nuw(ptr %x, i64 %y) { +; CHECK-LABEL: @uge_base_nusw_nuw( +; CHECK-NEXT: ret i1 true +; + %g = getelementptr nusw nuw i8, ptr %x, i64 %y + %r = icmp uge ptr %g, %x + ret i1 %r +} + define i1 @ugt_base_inbounds_commute(i64 %y) { ; CHECK-LABEL: @ugt_base_inbounds_commute( ; CHECK-NEXT: [[X:%.*]] = call ptr @getptr() @@ -285,6 +333,67 @@ define i1 @test60(ptr %foo, i64 %i, i64 %j) { ret i1 %cmp } +define i1 @test60_nusw(ptr %foo, i64 %i, i64 %j) { +; CHECK-LABEL: @test60_nusw( +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[I:%.*]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[GEP1_IDX]], [[J:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr nusw i32, ptr %foo, i64 %i + %gep2 = getelementptr nusw i8, ptr %foo, i64 %j + %cmp = icmp ult ptr %gep1, %gep2 + ret i1 %cmp +} + +define i1 @test60_nusw_inbounds(ptr %foo, i64 %i, i64 %j) { +; CHECK-LABEL: @test60_nusw_inbounds( +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nsw i64 [[I:%.*]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[GEP1_IDX]], [[J:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr nusw i32, ptr %foo, i64 %i + %gep2 = getelementptr inbounds i8, ptr %foo, i64 %j + %cmp = icmp ult ptr %gep1, %gep2 + ret i1 %cmp +} + +define i1 @test60_nuw(ptr %foo, i64 %i, i64 %j) { +; CHECK-LABEL: @test60_nuw( +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw i64 [[I:%.*]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[GEP1_IDX]], [[J:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr nuw i32, ptr %foo, i64 %i + %gep2 = getelementptr nuw i8, ptr %foo, i64 %j + %cmp = icmp ult ptr %gep1, %gep2 + ret i1 %cmp +} + +define i1 @test60_nusw_nuw(ptr %foo, i64 %i, i64 %j) { +; CHECK-LABEL: @test60_nusw_nuw( +; CHECK-NEXT: [[GEP1_IDX:%.*]] = shl nuw nsw i64 [[I:%.*]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[GEP1_IDX]], [[J:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr nusw nuw i32, ptr %foo, i64 %i + %gep2 = getelementptr nusw nuw i8, ptr %foo, i64 %j + %cmp = icmp ult ptr %gep1, %gep2 + ret i1 %cmp +} + +define i1 @test60_nusw_nuw_mix(ptr %foo, i64 %i, i64 %j) { +; CHECK-LABEL: @test60_nusw_nuw_mix( +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr nuw i32, ptr [[FOO:%.*]], i64 [[I:%.*]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr nusw i8, ptr [[FOO]], i64 [[J:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[GEP1]], [[GEP2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %gep1 = getelementptr nuw i32, ptr %foo, i64 %i + %gep2 = getelementptr nusw i8, ptr %foo, i64 %j + %cmp = icmp ult ptr %gep1, %gep2 + ret i1 %cmp +} + define i1 @test_gep_ult_no_inbounds(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test_gep_ult_no_inbounds( ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[FOO:%.*]], i64 
[[I:%.*]] diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index c1b9752607c3d..b266d3e77c434 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -2197,8 +2197,7 @@ define i1 @icmp_ashr_and_overshift(i8 %X) { define i1 @icmp_and_ashr_neg_and_legal(i8 %x) { ; CHECK-LABEL: @icmp_and_ashr_neg_and_legal( -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 32 ; CHECK-NEXT: ret i1 [[CMP]] ; %ashr = ashr i8 %x, 4 diff --git a/llvm/test/Transforms/InstCombine/lshr.ll b/llvm/test/Transforms/InstCombine/lshr.ll index 00626015d2ed7..ff358c6bc772a 100644 --- a/llvm/test/Transforms/InstCombine/lshr.ll +++ b/llvm/test/Transforms/InstCombine/lshr.ll @@ -1523,3 +1523,65 @@ define <2 x i8> @bool_add_lshr_vec_wrong_shift_amt(<2 x i1> %a, <2 x i1> %b) { %lshr = lshr <2 x i8> %add, ret <2 x i8> %lshr } + +define i32 @lowbits_of_lshr_mul(i64 %x) { +; CHECK-LABEL: @lowbits_of_lshr_mul( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[X:%.*]] to i32 +; CHECK-NEXT: [[CONV:%.*]] = mul i32 [[TMP0]], 15 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %mul = mul i64 %x, 64424509440 + %shift = lshr i64 %mul, 32 + %conv = trunc i64 %shift to i32 + ret i32 %conv +} + +define i32 @lowbits_of_lshr_mul_mask(i32 %x) { +; CHECK-LABEL: @lowbits_of_lshr_mul_mask( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[X:%.*]], 1600 +; CHECK-NEXT: [[CONV:%.*]] = and i32 [[TMP0]], 32704 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %mul = mul i32 %x, 104857600 + %shift = lshr i32 %mul, 16 + %conv = and i32 %shift, 32767 + ret i32 %conv +} + +; Negative tests + +define i32 @lowbits_of_lshr_mul_mask_multiuse(i32 %x) { +; CHECK-LABEL: @lowbits_of_lshr_mul_mask_multiuse( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[X:%.*]], 104857600 +; CHECK-NEXT: call void @use(i32 [[MUL]]) +; CHECK-NEXT: [[SHIFT:%.*]] = lshr exact i32 [[MUL]], 16 +; CHECK-NEXT: [[CONV:%.*]] = and i32 [[SHIFT]], 32704 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %mul = mul i32 %x, 104857600 + call void @use(i32 %mul) + %shift = lshr i32 %mul, 16 + %conv = and i32 %shift, 32767 + ret i32 %conv +} + +define i32 @lowbits_of_lshr_mul_mask_indivisible(i32 %x) { +; CHECK-LABEL: @lowbits_of_lshr_mul_mask_indivisible( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[X:%.*]], 25600 +; CHECK-NEXT: [[SHIFT:%.*]] = lshr i32 [[MUL]], 16 +; CHECK-NEXT: [[CONV:%.*]] = and i32 [[SHIFT]], 32767 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %mul = mul i32 %x, 25600 + %shift = lshr i32 %mul, 16 + %conv = and i32 %shift, 32767 + ret i32 %conv +} diff --git a/llvm/test/Transforms/InstCombine/minmax-fp.ll b/llvm/test/Transforms/InstCombine/minmax-fp.ll index 1276b7b3e3867..4fe8cf374344e 100644 --- a/llvm/test/Transforms/InstCombine/minmax-fp.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fp.ll @@ -321,7 +321,7 @@ define double @fneg_fmin(double %x, double %y) { define float @maxnum_ogt_fmf_on_select(float %a, float %b) { ; CHECK-LABEL: @maxnum_ogt_fmf_on_select( -; CHECK-NEXT: [[F:%.*]] = call nnan nsz float @llvm.maxnum.f32(float [[A:%.*]], float [[B:%.*]]) +; CHECK-NEXT: [[F:%.*]] = call nsz float @llvm.maxnum.f32(float [[A:%.*]], float [[B:%.*]]) ; CHECK-NEXT: ret float [[F]] ; %cond = fcmp ogt float %a, %b @@ -331,7 +331,7 @@ define float @maxnum_ogt_fmf_on_select(float %a, float %b) { define <2 x float> @maxnum_oge_fmf_on_select(<2 x 
float> %a, <2 x float> %b) { ; CHECK-LABEL: @maxnum_oge_fmf_on_select( -; CHECK-NEXT: [[F:%.*]] = call nnan ninf nsz <2 x float> @llvm.maxnum.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) +; CHECK-NEXT: [[F:%.*]] = call ninf nsz <2 x float> @llvm.maxnum.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) ; CHECK-NEXT: ret <2 x float> [[F]] ; %cond = fcmp oge <2 x float> %a, %b @@ -385,7 +385,7 @@ define float @maxnum_no_nnan(float %a, float %b) { define float @minnum_olt_fmf_on_select(float %a, float %b) { ; CHECK-LABEL: @minnum_olt_fmf_on_select( -; CHECK-NEXT: [[F:%.*]] = call nnan nsz float @llvm.minnum.f32(float [[A:%.*]], float [[B:%.*]]) +; CHECK-NEXT: [[F:%.*]] = call nsz float @llvm.minnum.f32(float [[A:%.*]], float [[B:%.*]]) ; CHECK-NEXT: ret float [[F]] ; %cond = fcmp olt float %a, %b @@ -395,7 +395,7 @@ define float @minnum_olt_fmf_on_select(float %a, float %b) { define <2 x float> @minnum_ole_fmf_on_select(<2 x float> %a, <2 x float> %b) { ; CHECK-LABEL: @minnum_ole_fmf_on_select( -; CHECK-NEXT: [[F:%.*]] = call nnan ninf nsz <2 x float> @llvm.minnum.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) +; CHECK-NEXT: [[F:%.*]] = call ninf nsz <2 x float> @llvm.minnum.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) ; CHECK-NEXT: ret <2 x float> [[F]] ; %cond = fcmp ole <2 x float> %a, %b diff --git a/llvm/test/Transforms/InstCombine/pr17827.ll b/llvm/test/Transforms/InstCombine/pr17827.ll index 2f10bb5c7f25f..58b77ec60620e 100644 --- a/llvm/test/Transforms/InstCombine/pr17827.ll +++ b/llvm/test/Transforms/InstCombine/pr17827.ll @@ -5,8 +5,7 @@ define i1 @test_shift_and_cmp_not_changed1(i8 %p) { ; CHECK-LABEL: @test_shift_and_cmp_not_changed1( ; CHECK-NEXT: [[SHLP:%.*]] = shl i8 [[P:%.*]], 5 -; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[SHLP]], 64 ; CHECK-NEXT: ret i1 [[CMP]] ; %shlp = shl i8 %p, 5 @@ -18,10 +17,7 @@ define i1 @test_shift_and_cmp_not_changed1(i8 %p) { ; With arithmetic right shift, the comparison should not be modified. 
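+; (A sketch of the range reasoning behind the new result below: ashr i8 %p, 5
+; preserves the sign, so %shlp is in [-4, 3]; masking with -64 then yields
+; only 0 or -64, and both values satisfy slt 32, so the function now folds
+; to ret i1 true.)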
define i1 @test_shift_and_cmp_not_changed2(i8 %p) { ; CHECK-LABEL: @test_shift_and_cmp_not_changed2( -; CHECK-NEXT: [[SHLP:%.*]] = ashr i8 [[P:%.*]], 5 -; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %shlp = ashr i8 %p, 5 %andp = and i8 %shlp, -64 @@ -34,8 +30,7 @@ define i1 @test_shift_and_cmp_not_changed2(i8 %p) { define i1 @test_shift_and_cmp_changed1(i8 %p, i8 %q) { ; CHECK-LABEL: @test_shift_and_cmp_changed1( ; CHECK-NEXT: [[ANDP:%.*]] = shl i8 [[P:%.*]], 5 -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[ANDP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[TMP1]], 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 33 ; CHECK-NEXT: ret i1 [[CMP]] ; %andp = and i8 %p, 6 @@ -50,8 +45,7 @@ define i1 @test_shift_and_cmp_changed1(i8 %p, i8 %q) { define <2 x i1> @test_shift_and_cmp_changed1_vec(<2 x i8> %p, <2 x i8> %q) { ; CHECK-LABEL: @test_shift_and_cmp_changed1_vec( ; CHECK-NEXT: [[ANDP:%.*]] = shl <2 x i8> [[P:%.*]], splat (i8 5) -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[ANDP]], splat (i8 -64) -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[TMP1]], splat (i8 32) +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[ANDP]], splat (i8 33) ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %andp = and <2 x i8> %p, @@ -91,9 +85,7 @@ define <2 x i1> @test_shift_and_cmp_changed2_vec(<2 x i8> %p) { ; nsw on the shift should not affect the comparison. define i1 @test_shift_and_cmp_changed3(i8 %p) { ; CHECK-LABEL: @test_shift_and_cmp_changed3( -; CHECK-NEXT: [[SHLP:%.*]] = shl nsw i8 [[P:%.*]], 5 -; CHECK-NEXT: [[ANDP:%.*]] = and i8 [[SHLP]], -64 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[ANDP]], 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[P:%.*]], 2 ; CHECK-NEXT: ret i1 [[CMP]] ; %shlp = shl nsw i8 %p, 5 diff --git a/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll b/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll index b164dd983a892..178795f9f9a83 100644 --- a/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll +++ b/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll @@ -115,7 +115,7 @@ define float @select_max_ugt_2_use_cmp(float %a, float %b) { ; CHECK-LABEL: @select_max_ugt_2_use_cmp( ; CHECK-NEXT: [[CMP:%.*]] = fcmp reassoc ugt float [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: call void @foo(i1 [[CMP]]) -; CHECK-NEXT: [[SEL:%.*]] = call fast float @llvm.maxnum.f32(float [[A]], float [[B]]) +; CHECK-NEXT: [[SEL:%.*]] = call reassoc ninf nsz arcp contract afn float @llvm.maxnum.f32(float [[A]], float [[B]]) ; CHECK-NEXT: ret float [[SEL]] ; %cmp = fcmp reassoc ugt float %a, %b diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll new file mode 100644 index 0000000000000..ab29bf8d2d52a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -0,0 +1,121 @@ +; REQUIRES: asserts +; RUN: opt < %s -mcpu=neoverse-v2 -passes=loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s + +target triple="aarch64--linux-gnu" + +; This test shows that comparison and next iteration IV have zero cost if the +; vector loop gets executed exactly once with the given VF. 
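+; (The trip count is 16, so at VF 16 the vector body executes exactly once;
+; the backedge increment and exit compare then do no repeated work, which is
+; what the per-VF cost lines below are meant to pin down.)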
+define i64 @test(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: LV: Checking a loop in 'test'
+; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
+; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
+; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost for VF 8: 26
+; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
+; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
+; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost for VF 16: 50
+; CHECK: LV: Selecting VF: vscale x 2
+entry:
+  br label %for.body
+
+exit:                                             ; preds = %for.body
+  ret i64 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+  %sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %i.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i64
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %i.iv
+  %1 = load i8, ptr %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i64
+  %mul = mul nuw nsw i64 %conv3, %conv
+  %add = add i64 %mul, %sum
+  %i.iv.next = add nuw nsw i64 %i.iv, 1
+  %exitcond.not = icmp eq i64 %i.iv.next, 16
+  br i1 %exitcond.not, label %exit, label %for.body
+}
+
+; Same as above, but the next iteration IV has extra users, and thus its cost is not zero.
+define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: LV: Checking a loop in 'test_external_iv_user'
+; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
+; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
+; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost for VF 8: 26
+; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
+; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
+; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK: Cost for VF 16: 50
+; CHECK: LV: Selecting VF: vscale x 2
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+  %sum = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %i.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i64
+  %i.iv.next = add nuw nsw i64 %i.iv, 1
+  %arrayidx2 = getelementptr inbounds nuw i8, ptr %b, i64 %i.iv.next
+  %1 = load i8, ptr %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i64
+  %mul = mul nuw nsw i64 %conv3, %conv
+  %add = add i64 %sum, %mul
+  %exitcond.not = icmp eq i64 %i.iv.next, 16
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret i64 %add
+}
+
+; Same as above but with two IVs without extra
users. They all have zero cost when VF equals the number of iterations. +define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { +; CHECK-LABEL: LV: Checking a loop in 'test_two_ivs' +; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 +; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] +; CHECK-NEXT: Cost of 1 for VF 8: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1 +; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] +; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 +; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost for VF 8: 27 +; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1 +; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] +; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1 +; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] +; CHECK-NEXT: Cost of 1 for VF 16: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 +; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK: Cost for VF 16: 51 +; CHECK: LV: Selecting VF: 16 +entry: + br label %for.body + +exit: ; preds = %for.body + ret i64 %add + +for.body: ; preds = %entry, %for.body + %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] + %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] + %sum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %i.iv + %0 = load i8, ptr %arrayidx, align 1 + %conv = zext i8 %0 to i64 + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %j.iv + %1 = load i8, ptr %arrayidx2, align 1 + %conv3 = zext i8 %1 to i64 + %mul = mul nuw nsw i64 %conv3, %conv + %add = add i64 %mul, %sum + %i.iv.next = add nuw nsw i64 %i.iv, 1 + %j.iv.next = add nuw nsw i64 %j.iv, 1 + %exitcond.not = icmp eq i64 %i.iv.next, 16 + br i1 %exitcond.not, label %exit, label %for.body +} + +attributes #0 = { vscale_range(1, 16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 1ef01e3b793d5..bf22d63850835 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" declare void @init(ptr nocapture nofree) @@ -200,104 +200,6 @@ loop_exit: } -; Test where offset relative to alloca is negative and we shouldn't -; treat predicated loads as being always dereferenceable. 
-define i8 @test_negative_off(i16 %len, ptr %test_base) { -; CHECK-LABEL: @test_negative_off( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64638 x i8], align 1 -; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE2]] ] -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16 -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 -1000, [[DOTCAST]] -; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i16 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = load i1, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = load i1, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i1> poison, i1 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i1> [[TMP6]], i1 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP0]] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] -; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP1]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i8> [[TMP12]], i8 [[TMP15]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] -; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i8> [[TMP17]], <2 x i8> zeroinitializer -; CHECK-NEXT: [[TMP18]] = add <2 x i8> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 -; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]]) -; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ -988, [[MIDDLE_BLOCK]] ], [ -1000, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i8 [ [[BC_MERGE_RDX]], 
[[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[ADDR]], align 1 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i8 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i16 [[IV]], -990 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i8 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i8 [[ACCUM_NEXT_LCSSA]] -; -entry: - %alloca = alloca [64638 x i8] - call void @init(ptr %alloca) - br label %loop -loop: - %iv = phi i16 [ -1000, %entry ], [ %iv.next, %latch ] - %accum = phi i8 [ 0, %entry ], [ %accum.next, %latch ] - %iv.next = add i16 %iv, 1 - %test_addr = getelementptr inbounds i1, ptr %test_base, i16 %iv - %earlycnd = load i1, ptr %test_addr - br i1 %earlycnd, label %pred, label %latch -pred: - %addr = getelementptr i8, ptr %alloca, i16 %iv - %val = load i8, ptr %addr - br label %latch -latch: - %val.phi = phi i8 [ 0, %loop ], [ %val, %pred ] - %accum.next = add i8 %accum, %val.phi - %exit = icmp ugt i16 %iv, -990 - br i1 %exit, label %loop_exit, label %loop -loop_exit: - ret i8 %accum.next -} - - define i32 @loop_requires_scev_predicate(ptr %dest, i32 %end) { ; CHECK-LABEL: @loop_requires_scev_predicate( ; CHECK-NEXT: entry: @@ -423,3 +325,430 @@ for.inc: exit: ret i32 0 } + + +; Test reverse loops where we should be able to prove loads in predicated blocks +; are safe to load unconditionally. 
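+; (Both predicated loads below index 1024-element local allocas directly with
+; the IV, which only takes values in [0, 1023], so every lane's address lies
+; inside the allocation and the accesses can be treated as dereferenceable.)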
+define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) { +; CHECK-LABEL: @test_rev_loops_deref_loads( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init(ptr [[LOCAL_SRC]]) +; CHECK-NEXT: call void @init(ptr [[LOCAL_CMP]]) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP12:%.*]] = shl nsw i32 [[TMP11]], 2 +; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] +; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP21]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 2 +; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[IV]] +; CHECK-NEXT: [[TMP19:%.*]] 
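+; (Here the access index is %iv - 1, which reaches -1 on the final iteration,
+; one element before the start of the allocas, so dereferenceability cannot
+; be proven for all lanes and the loads must remain predicated.)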
= load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP19]], 3 +; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[IV]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP20]], 2 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[IV]] +; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false) +; CHECK-NEXT: ret void +; +entry: + %local_dest = alloca [1024 x i32], align 4 + %local_src = alloca [1024 x i32], align 4 + %local_cmp = alloca [1024 x i32], align 4 + call void @init(ptr %local_src) + call void @init(ptr %local_cmp) + br label %for.body + +for.body: + %iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.inc ] + %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp3.not = icmp eq i32 %0, 3 + br i1 %cmp3.not, label %for.inc, label %if.then + +if.then: + %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %iv + %1 = load i32, ptr %arrayidx5, align 4 + %mul = shl nsw i32 %1, 2 + %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %iv + store i32 %mul, ptr %arrayidx7, align 4 + br label %for.inc + +for.inc: + %iv.next = add nsw i64 %iv, -1 + %cmp2.not = icmp eq i64 %iv, 0 + br i1 %cmp2.not, label %exit, label %for.body + +exit: + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %local_dest, i64 1024, i1 false) + ret void +} + + +; Test reverse loops where we *cannot* prove loads in predicated blocks are safe +; to load unconditionally. 
+define void @test_rev_loops_non_deref_loads(ptr nocapture noundef writeonly %dest) { +; CHECK-LABEL: @test_rev_loops_non_deref_loads( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init(ptr [[LOCAL_SRC]]) +; CHECK-NEXT: call void @init(ptr [[LOCAL_CMP]]) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 -1) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 -1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = shl nsw i32 [[TMP10]], 2 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP12]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = shl nsw i32 [[TMP17]], 2 +; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP19]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 -2) +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label 
[[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[OFF:%.*]] = add i64 [[IV]], -1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[OFF]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP22]], 3 +; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[OFF]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP23]], 2 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[OFF]] +; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false) +; CHECK-NEXT: ret void +; +entry: + %local_dest = alloca [1024 x i32], align 4 + %local_src = alloca [1024 x i32], align 4 + %local_cmp = alloca [1024 x i32], align 4 + call void @init(ptr %local_src) + call void @init(ptr %local_cmp) + br label %for.body + +for.body: + %iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.inc ] + %off = add i64 %iv, -1 + %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %off + %0 = load i32, ptr %arrayidx, align 4 + %cmp3.not = icmp eq i32 %0, 3 + br i1 %cmp3.not, label %for.inc, label %if.then + +if.then: + %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %off + %1 = load i32, ptr %arrayidx5, align 4 + %mul = shl nsw i32 %1, 2 + %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %off + store i32 %mul, ptr %arrayidx7, align 4 + br label %for.inc + +for.inc: + %iv.next = add nsw i64 %iv, -1 + %cmp2.not = icmp eq i64 %iv, 0 + br i1 %cmp2.not, label %exit, label %for.body + +exit: + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %local_dest, i64 1024, i1 false) + ret void +} + + +; Test a loop with a positive step recurrence that has a strided access +define i16 @test_strided_access(i64 %len, ptr %test_base) { +; CHECK-LABEL: @test_strided_access( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [163840 x i16], align 4 +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP0]] +; CHECK-NEXT: 
[[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[TMP6]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> poison, i16 [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP11]], i16 [[TMP10]], i32 1 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[TMP12]], <2 x i16> zeroinitializer +; CHECK-NEXT: [[TMP13]] = add <2 x i16> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP15:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP13]]) +; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[IV]] +; CHECK-NEXT: [[L_T:%.*]] = load i8, ptr [[TEST_ADDR]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sge i8 [[L_T]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[PRED:%.*]], label [[LATCH]] +; CHECK: pred: +; CHECK-NEXT: [[IV_STRIDE:%.*]] = mul i64 [[IV]], 2 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[IV_STRIDE]] +; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[ADDR]], align 2 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i16 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] +; CHECK-NEXT: [[ACCUM_NEXT]] = add i16 [[ACCUM]], [[VAL_PHI]] +; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095 +; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: loop_exit: +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i16 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i16 [[ACCUM_NEXT_LCSSA]] +; +entry: + %alloca = alloca [163840 x i16], align 4 + call void @init(ptr %alloca) + br label %loop +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] + %accum = phi i16 [ 0, %entry ], [ %accum.next, %latch ] + %iv.next = add i64 %iv, 1 + %test_addr = getelementptr inbounds i8, ptr %test_base, i64 %iv + %l.t = load i8, ptr %test_addr + %cmp = icmp sge i8 %l.t, 0 + br i1 %cmp, label %pred, label %latch +pred: + %iv.stride = 
mul i64 %iv, 2 + %addr = getelementptr inbounds i16, ptr %alloca, i64 %iv.stride + %val = load i16, ptr %addr, align 2 + br label %latch +latch: + %val.phi = phi i16 [0, %loop], [%val, %pred] + %accum.next = add i16 %accum, %val.phi + %exit = icmp eq i64 %iv, 4095 + br i1 %exit, label %loop_exit, label %loop + +loop_exit: + ret i16 %accum.next +} + + +; Test a loop with a negative step recurrence that has a strided access +define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly %dest) { +; CHECK-LABEL: @test_rev_loops_strided_deref_loads( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOCAL_DEST:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_SRC:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[LOCAL_CMP:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init(ptr [[LOCAL_SRC]]) +; CHECK-NEXT: call void @init(ptr [[LOCAL_CMP]]) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 511, [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = shl nsw i32 [[TMP11]], 2 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP23]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP24]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i32 [[TMP12]], 2 +; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] +; 
CHECK: pred.store.continue2: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 -2) +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 511, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[IV]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP21]], 3 +; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[IV_STRIDED:%.*]] = mul i64 [[IV]], 2 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[IV_STRIDED]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP22]], 2 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[IV]] +; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0 +; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false) +; CHECK-NEXT: ret void +; +entry: + %local_dest = alloca [1024 x i32], align 4 + %local_src = alloca [1024 x i32], align 4 + %local_cmp = alloca [1024 x i32], align 4 + call void @init(ptr %local_src) + call void @init(ptr %local_cmp) + br label %for.body + +for.body: + %iv = phi i64 [ 511, %entry ], [ %iv.next, %for.inc ] + %arrayidx = getelementptr inbounds [1024 x i32], ptr %local_cmp, i64 0, i64 %iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp3.not = icmp eq i32 %0, 3 + br i1 %cmp3.not, label %for.inc, label %if.then + +if.then: + %iv.strided = mul i64 %iv, 2 + %arrayidx5 = getelementptr inbounds [1024 x i32], ptr %local_src, i64 0, i64 %iv.strided + %1 = load i32, ptr %arrayidx5, align 4 + %mul = shl nsw i32 %1, 2 + %arrayidx7 = getelementptr inbounds [1024 x i32], ptr %local_dest, i64 0, i64 %iv + store i32 %mul, ptr %arrayidx7, align 4 + br label %for.inc + +for.inc: + %iv.next = add nsw i64 %iv, -1 + %cmp2.not = icmp eq i64 %iv, 0 + br i1 %cmp2.not, label %exit, label %for.body + +exit: + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %local_dest, i64 1024, i1 false) + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll new file mode 100644 index 0000000000000..1dd526df503bd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s + +target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1-p:16:16:16:16" + +declare void @init(ptr nocapture nofree) + + +; Test where offset relative to alloca is negative and we shouldn't +; treat predicated loads as being always dereferenceable. +define i8 @test_negative_off(i16 %len, ptr %test_base) { +; CHECK-LABEL: @test_negative_off( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [64638 x i8], align 1 +; CHECK-NEXT: call void @init(ptr [[ALLOCA]]) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 -1000, [[DOTCAST]] +; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE:%.*]], i16 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = load i1, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i1, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i1> poison, i1 [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i1> [[TMP6]], i1 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[TMP1]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i8> [[TMP12]], i8 [[TMP15]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i8> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i8> [[TMP17]], <2 x i8> zeroinitializer +; CHECK-NEXT: [[TMP18]] = add <2 x i8> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]]) +; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ -988, [[MIDDLE_BLOCK]] ], [ -1000, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, 
[[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 +; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]] +; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 +; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] +; CHECK: pred: +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[IV]] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[ADDR]], align 1 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] +; CHECK-NEXT: [[ACCUM_NEXT]] = add i8 [[ACCUM]], [[VAL_PHI]] +; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i16 [[IV]], -990 +; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: loop_exit: +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i8 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i8 [[ACCUM_NEXT_LCSSA]] +; +entry: + %alloca = alloca [64638 x i8] + call void @init(ptr %alloca) + br label %loop +loop: + %iv = phi i16 [ -1000, %entry ], [ %iv.next, %latch ] + %accum = phi i8 [ 0, %entry ], [ %accum.next, %latch ] + %iv.next = add i16 %iv, 1 + %test_addr = getelementptr inbounds i1, ptr %test_base, i16 %iv + %earlycnd = load i1, ptr %test_addr + br i1 %earlycnd, label %pred, label %latch +pred: + %addr = getelementptr i8, ptr %alloca, i16 %iv + %val = load i8, ptr %addr + br label %latch +latch: + %val.phi = phi i8 [ 0, %loop ], [ %val, %pred ] + %accum.next = add i8 %accum, %val.phi + %exit = icmp ugt i16 %iv, -990 + br i1 %exit, label %loop_exit, label %loop +loop_exit: + ret i8 %accum.next +} diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll index 7b66440a7fdcc..301526cf3070c 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll @@ -1,320 +1,1323 @@ -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 --check-prefix=CHECK - -define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) { -; CHECK-LABEL: @select_const_i32_from_icmp -; CHECK-VF4IC1: vector.body: -; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] -; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32> -; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], splat (i32 3) -; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], splat (i1 true) -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]] -; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]]) -; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3 - -; CHECK-VF4IC4: vector.body: -; CHECK-VF4IC4: 
[[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ] -; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ] -; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ] -; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ] -; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq <4 x i32> {{.*}}, splat (i32 3) -; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq <4 x i32> {{.*}}, splat (i32 3) -; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq <4 x i32> {{.*}}, splat (i32 3) -; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq <4 x i32> {{.*}}, splat (i32 3) -; CHECK-VF4IC4-NEXT: [[NOT1:%.*]] = xor <4 x i1> [[VEC_ICMP1]], splat (i1 true) -; CHECK-VF4IC4-NEXT: [[NOT2:%.*]] = xor <4 x i1> [[VEC_ICMP2]], splat (i1 true) -; CHECK-VF4IC4-NEXT: [[NOT3:%.*]] = xor <4 x i1> [[VEC_ICMP3]], splat (i1 true) -; CHECK-VF4IC4-NEXT: [[NOT4:%.*]] = xor <4 x i1> [[VEC_ICMP4]], splat (i1 true) -; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = or <4 x i1> [[VEC_PHI1]], [[NOT1]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = or <4 x i1> [[VEC_PHI2]], [[NOT2]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = or <4 x i1> [[VEC_PHI3]], [[NOT3]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = or <4 x i1> [[VEC_PHI4]], [[NOT4]] -; CHECK-VF4IC4: middle.block: -; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = or <4 x i1> [[VEC_SEL2]], [[VEC_SEL1]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = or <4 x i1> [[VEC_SEL3]], [[VEC_SEL5]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL7:%.*]] = or <4 x i1> [[VEC_SEL4]], [[VEC_SEL6]] -; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL7]]) -; CHECK-VF4IC4-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3 - - -; CHECK-VF1IC4: vector.body: -; CHECK-VF1IC4: [[VEC_PHI1:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ] -; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ] -; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ] -; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ] -; CHECK-VF1IC4: [[VEC_LOAD1:%.*]] = load i32 -; CHECK-VF1IC4-NEXT: [[VEC_LOAD2:%.*]] = load i32 -; CHECK-VF1IC4-NEXT: [[VEC_LOAD3:%.*]] = load i32 -; CHECK-VF1IC4-NEXT: [[VEC_LOAD4:%.*]] = load i32 -; CHECK-VF1IC4-NEXT: [[VEC_ICMP1:%.*]] = icmp eq i32 [[VEC_LOAD1]], 3 -; CHECK-VF1IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq i32 [[VEC_LOAD2]], 3 -; CHECK-VF1IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq i32 [[VEC_LOAD3]], 3 -; CHECK-VF1IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq i32 [[VEC_LOAD4]], 3 -; CHECK-VF1IC4-NEXT: [[NOT1:%.*]] = xor i1 [[VEC_ICMP1]], true -; CHECK-VF1IC4-NEXT: [[NOT2:%.*]] = xor i1 [[VEC_ICMP2]], true -; CHECK-VF1IC4-NEXT: [[NOT3:%.*]] = xor i1 [[VEC_ICMP3]], true -; CHECK-VF1IC4-NEXT: [[NOT4:%.*]] = xor i1 [[VEC_ICMP4]], true -; CHECK-VF1IC4-NEXT: [[VEC_SEL1:%.*]] = or i1 [[VEC_PHI1]], [[NOT1]] -; CHECK-VF1IC4-NEXT: [[VEC_SEL2:%.*]] = or i1 [[VEC_PHI2]], [[NOT2]] -; CHECK-VF1IC4-NEXT: [[VEC_SEL3:%.*]] = or i1 [[VEC_PHI3]], [[NOT3]] -; CHECK-VF1IC4-NEXT: [[VEC_SEL4:%.*]] = or i1 [[VEC_PHI4]], [[NOT4]] -; CHECK-VF1IC4: middle.block: -; CHECK-VF1IC4-NEXT: [[VEC_SEL5:%.*]] = or i1 [[VEC_SEL2]], [[VEC_SEL1]] -; CHECK-VF1IC4-NEXT: [[VEC_SEL6:%.*]] = or i1 
[[VEC_SEL3]], [[VEC_SEL5]]
-; CHECK-VF1IC4-NEXT: [[OR_RDX:%.*]] = or i1 [[VEC_SEL4]], [[VEC_SEL6]]
-; CHECK-VF1IC4-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF1IC4-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-VF4IC1
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-VF4IC4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-VF1IC4
+define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) {
+; CHECK-VF4IC1-LABEL: define i32 @select_const_i32_from_icmp(
+; CHECK-VF4IC1-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
+; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP7]]
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC1: [[LOOP]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC1-NEXT: [[CMP_V_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_V_IV_3]], i32 [[RDX]], i32 7
+; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF4IC1: [[EXIT]]:
+; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i32 @select_const_i32_from_icmp(
+; CHECK-VF4IC4-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD4]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD5]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD6]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP9]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI]], [[TMP10]]
+; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI2]], [[TMP12]]
+; CHECK-VF4IC4-NEXT: [[TMP17]] = or <4 x i1> [[VEC_PHI3]], [[TMP13]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP15]], [[TMP14]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX7:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX8:%.*]] = or <4 x i1> [[TMP17]], [[BIN_RDX7]]
+; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX8]])
+; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]]
+; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 7, i32 3
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC4: [[LOOP]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP_V_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_V_IV_3]], i32 [[RDX]], i32 7
+; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF4IC4: [[EXIT]]:
+; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i32 @select_const_i32_from_icmp(
+; CHECK-VF1IC4-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF1IC4: [[VECTOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 3
+; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3
+; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3
+; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true
+; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true
+; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true
+; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true
+; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]]
+; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]]
+; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]]
+; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP21]], [[TMP20]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX4:%.*]] = or i1 [[TMP22]], [[BIN_RDX]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX5:%.*]] = or i1 [[TMP23]], [[BIN_RDX4]]
+; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = freeze i1 [[BIN_RDX5]]
+; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP25]], i32 7, i32 3
+; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF1IC4: [[LOOP]]:
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP_V_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_V_IV_3]], i32 [[RDX]], i32 7
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF1IC4: [[EXIT]]:
+; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
 entry:
- br label %for.body
+ br label %loop
-for.body: ; preds = %entry, %for.body
- %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
- %1 = phi i32 [ 3, %entry ], [ %5, %for.body ]
- %2 = getelementptr inbounds i32, ptr %v, i64 %0
- %3 = load i32, ptr %2, align 4
- %4 = icmp eq i32 %3, 3
- %5 = select i1 %4, i32 %1, i32 7
- %6 = add nuw nsw i64 %0, 1
- %7 = icmp eq i64 %6, %n
- br i1 %7, label %exit, label %for.body
+loop: ; preds = %entry, %loop
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %rdx = phi i32 [ 3, %entry ], [ %sel, %loop ]
+ %gep.v.iv = getelementptr inbounds i32, ptr %v, i64 %iv
+ %load.v.iv = load i32, ptr %gep.v.iv, align 4
+ %cmp.v.iv.3 = icmp eq i32 %load.v.iv, 3
+ %sel = select i1 %cmp.v.iv.3, i32 %rdx, i32 7
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cond = icmp eq i64 %iv.next, %n
+ br i1 %exit.cond, label %exit, label %loop
-exit: ; preds = %for.body
- ret i32 %5
+exit: ; preds = %loop
+ ret i32 %sel
 }
-
-define i32 @select_const_i32_from_icmp2(ptr nocapture readonly %v, i64 %n) {
-; CHECK-LABEL: @select_const_i32_from_icmp2
-; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
-; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
-; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], splat (i32 3)
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
-; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
-
+define i32 @select_const_i32_from_icmp2(ptr %v, i64 %n) {
+; CHECK-VF4IC1-LABEL: define i32 @select_const_i32_from_icmp2(
+; CHECK-VF4IC1-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI]], [[TMP3]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = freeze i1 [[TMP6]]
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC1: [[LOOP]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC1-NEXT: [[CMP_V_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_V_IV_3]], i32 7, i32 [[RDX]]
+; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF4IC1: [[EXIT]]:
+; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i32 @select_const_i32_from_icmp2(
+; CHECK-VF4IC4-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD4]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD5]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD6]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP10]] = or <4 x i1> [[VEC_PHI]], [[TMP6]]
+; CHECK-VF4IC4-NEXT: [[TMP11]] = or <4 x i1> [[VEC_PHI1]], [[TMP7]]
+; CHECK-VF4IC4-NEXT: [[TMP12]] = or <4 x i1> [[VEC_PHI2]], [[TMP8]]
+; CHECK-VF4IC4-NEXT: [[TMP13]] = or <4 x i1> [[VEC_PHI3]], [[TMP9]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP11]], [[TMP10]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX7:%.*]] = or <4 x i1> [[TMP12]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX8:%.*]] = or <4 x i1> [[TMP13]], [[BIN_RDX7]]
+; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX8]])
+; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = freeze i1 [[TMP15]]
+; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP16]], i32 7, i32 3
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC4: [[LOOP]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP_V_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_V_IV_3]], i32 7, i32 [[RDX]]
+; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF4IC4: [[EXIT]]:
+; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i32 @select_const_i32_from_icmp2(
+; CHECK-VF1IC4-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF1IC4: [[VECTOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 3
+; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3
+; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3
+; CHECK-VF1IC4-NEXT: [[TMP16]] = or i1 [[VEC_PHI]], [[TMP12]]
+; CHECK-VF1IC4-NEXT: [[TMP17]] = or i1 [[VEC_PHI1]], [[TMP13]]
+; CHECK-VF1IC4-NEXT: [[TMP18]] = or i1 [[VEC_PHI2]], [[TMP14]]
+; CHECK-VF1IC4-NEXT: [[TMP19]] = or i1 [[VEC_PHI3]], [[TMP15]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF1IC4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP17]], [[TMP16]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX4:%.*]] = or i1 [[TMP18]], [[BIN_RDX]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX5:%.*]] = or i1 [[TMP19]], [[BIN_RDX4]]
+; CHECK-VF1IC4-NEXT: [[TMP21:%.*]] = freeze i1 [[BIN_RDX5]]
+; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP21]], i32 7, i32 3
+; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF1IC4: [[LOOP]]:
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP_V_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_V_IV_3]], i32 7, i32 [[RDX]]
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF1IC4: [[EXIT]]:
+; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
 entry:
- br label %for.body
+ br label %loop
-for.body: ; preds = %entry, %for.body
- %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
- %1 = phi i32 [ 3, %entry ], [ %5, %for.body ]
- %2 = getelementptr inbounds i32, ptr %v, i64 %0
- %3 = load i32, ptr %2, align 4
- %4 = icmp eq i32 %3, 3
- %5 = select i1 %4, i32 7, i32 %1
- %6 = add nuw nsw i64 %0, 1
- %7 = icmp eq i64 %6, %n
- br i1 %7, label %exit, label %for.body
+loop: ; preds = %entry, %loop
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %rdx = phi i32 [ 3, %entry ], [ %sel, %loop ]
+ %gep.v.iv = getelementptr inbounds i32, ptr %v, i64 %iv
+ %load.v.iv = load i32, ptr %gep.v.iv, align 4
+ %cmp.v.iv.3 = icmp eq i32 %load.v.iv, 3
+ %sel = select i1 %cmp.v.iv.3, i32 7, i32 %rdx
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cond = icmp eq i64 %iv.next, %n
+ br i1 %exit.cond, label %exit, label %loop
-exit: ; preds = %for.body
- ret i32 %5
+exit: ; preds = %loop
+ ret i32 %sel
 }
-
-define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) {
-; CHECK-LABEL: @select_i32_from_icmp
-; CHECK-VF4IC1: vector.ph:
-; CHECK-VF4IC1-NOT: shufflevector <4 x i32>
-; CHECK-VF4IC1-NOT: shufflevector <4 x i32>
-; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
-; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
-; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], splat (i32 3)
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], splat (i1 true)
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
-; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a
+define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) {
+; CHECK-VF4IC1-LABEL: define i32 @select_i32_from_icmp(
+; CHECK-VF4IC1-SAME: ptr [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
+; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP7]]
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i32 [[B]], i32 [[A]]
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC1: [[LOOP]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC1-NEXT: [[CMP_LOAD_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_LOAD_IV_3]], i32 [[RDX]], i32 [[B]]
+; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF4IC1: [[EXIT]]:
+; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i32 @select_i32_from_icmp(
+; CHECK-VF4IC4-SAME: ptr [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD4]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD5]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD6]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP9]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI]], [[TMP10]]
+; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI2]], [[TMP12]]
+; CHECK-VF4IC4-NEXT: [[TMP17]] = or <4 x i1> [[VEC_PHI3]], [[TMP13]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP15]], [[TMP14]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX7:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX8:%.*]] = or <4 x i1> [[TMP17]], [[BIN_RDX7]]
+; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX8]])
+; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]]
+; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 [[B]], i32 [[A]]
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC4: [[LOOP]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP_LOAD_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_LOAD_IV_3]], i32 [[RDX]], i32 [[B]]
+; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF4IC4: [[EXIT]]:
+; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i32 @select_i32_from_icmp(
+; CHECK-VF1IC4-SAME: ptr [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF1IC4: [[VECTOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 3
+; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3
+; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3
+; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true
+; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true
+; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true
+; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true
+; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]]
+; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]]
+; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]]
+; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP21]], [[TMP20]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX4:%.*]] = or i1 [[TMP22]], [[BIN_RDX]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX5:%.*]] = or i1 [[TMP23]], [[BIN_RDX4]]
+; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = freeze i1 [[BIN_RDX5]]
+; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP25]], i32 [[B]], i32 [[A]]
+; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF1IC4: [[LOOP]]:
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP_LOAD_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3
+; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_LOAD_IV_3]], i32 [[RDX]], i32 [[B]]
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF1IC4: [[EXIT]]:
+; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
 entry:
- br label %for.body
+ br label %loop
-for.body: ; preds = %entry, %for.body
- %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
- %1 = phi i32 [ %a, %entry ], [ %5, %for.body ]
- %2 = getelementptr inbounds i32, ptr %v, i64 %0
- %3 = load i32, ptr %2, align 4
- %4 = icmp eq i32 %3, 3
- %5 = select i1 %4, i32 %1, i32 %b
- %6 = add nuw nsw i64 %0, 1
- %7 = icmp eq i64 %6, %n
- br i1 %7, label %exit, label %for.body
+loop: ; preds = %entry, %loop
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %rdx = phi i32 [ %a, %entry ], [ %sel, %loop ]
+ %gep.v.iv = getelementptr inbounds i32, ptr %v, i64 %iv
+ %load.v.iv = load i32, ptr %gep.v.iv, align 4
+ %cmp.load.iv.3 = icmp eq i32 %load.v.iv, 3
+ %sel = select i1 %cmp.load.iv.3, i32 %rdx, i32 %b
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cond = icmp eq i64 %iv.next, %n
+ br i1 %exit.cond, label %exit, label %loop
-exit: ; preds = %for.body
- ret i32 %5
+exit: ; preds = %loop
+ ret i32 %sel
 }
-
-define i32 @select_const_i32_from_fcmp_fast(ptr nocapture readonly %v, i64 %n) {
-; CHECK-LABEL: @select_const_i32_from_fcmp_fast
-; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
-; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float>
-; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp fast ueq <4 x float> [[VEC_LOAD]], splat (float 3.000000e+00)
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], splat (i1 true)
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
-; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
+define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) {
+; CHECK-VF4IC1-LABEL: define i32 @select_const_i32_from_fcmp_fast(
+; CHECK-VF4IC1-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
+; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP7]]
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC1: [[LOOP]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[LOAD_V_IV:%.*]] = load float, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC1-NEXT: [[CMP_LOAD_IV_3:%.*]] = fcmp fast ueq float [[LOAD_V_IV]], 3.000000e+00
+; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_LOAD_IV_3]], i32 [[RDX]], i32 1
+; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF4IC1: [[EXIT]]:
+; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i32 @select_const_i32_from_fcmp_fast(
+; CHECK-VF4IC4-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD4]], splat (float 3.000000e+00)
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD5]], splat (float 3.000000e+00)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD6]], splat (float 3.000000e+00)
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP9]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI]], [[TMP10]]
+; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI2]], [[TMP12]]
+; CHECK-VF4IC4-NEXT: [[TMP17]] = or <4 x i1> [[VEC_PHI3]], [[TMP13]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP15]], [[TMP14]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX7:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX8:%.*]] = or <4 x i1> [[TMP17]], [[BIN_RDX7]]
+; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX8]])
+; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]]
+; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 1, i32 2
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC4: [[LOOP]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[LOAD_V_IV:%.*]] = load float, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP_LOAD_IV_3:%.*]] = fcmp fast ueq float [[LOAD_V_IV]], 3.000000e+00
+; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_LOAD_IV_3]], i32 [[RDX]], i32 1
+; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF4IC4: [[EXIT]]:
+; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i32 @select_const_i32_from_fcmp_fast(
+; CHECK-VF1IC4-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF1IC4: [[VECTOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP4]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp fast ueq float [[TMP8]], 3.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp fast ueq float [[TMP9]], 3.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp fast ueq float [[TMP11]], 3.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true
+; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true
+; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true
+; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true
+; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]]
+; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]]
+; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]]
+; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP21]], [[TMP20]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX4:%.*]] = or i1 [[TMP22]], [[BIN_RDX]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX5:%.*]] = or i1 [[TMP23]], [[BIN_RDX4]]
+; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = freeze i1 [[BIN_RDX5]]
+; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP25]], i32 1, i32 2
+; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF1IC4: [[LOOP]]:
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[LOAD_V_IV:%.*]] = load float, ptr [[GEP_V_IV]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP_LOAD_IV_3:%.*]] = fcmp fast ueq float [[LOAD_V_IV]], 3.000000e+00
+; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_LOAD_IV_3]], i32 [[RDX]], i32 1
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF1IC4: [[EXIT]]:
+; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
 entry:
- br label %for.body
+ br label %loop
-for.body: ; preds = %entry, %for.body
- %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
- %1 = phi i32 [ 2, %entry ], [ %5, %for.body ]
- %2 = getelementptr inbounds float, ptr %v, i64 %0
- %3 = load float, ptr %2, align 4
- %4 = fcmp fast ueq float %3, 3.0
- %5 = select i1 %4, i32 %1, i32 1
- %6 = add nuw nsw i64 %0, 1
- %7 = icmp eq i64 %6, %n
- br i1 %7, label %exit, label %for.body
+loop: ; preds = %entry, %loop
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %rdx = phi i32 [ 2, %entry ], [ %sel, %loop ]
+ %gep.v.iv = getelementptr inbounds float, ptr %v, i64 %iv
+ %load.v.iv = load float, ptr %gep.v.iv, align 4
+ %cmp.load.iv.3 = fcmp fast ueq float %load.v.iv, 3.0
+ %sel = select i1 %cmp.load.iv.3, i32 %rdx, i32 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cond = icmp eq i64 %iv.next, %n
+ br i1 %exit.cond, label %exit, label %loop
-exit: ; preds = %for.body
- ret i32 %5
+exit: ; preds = %loop
+ ret i32 %sel
 }
-
-define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) {
-; CHECK-LABEL: @select_const_i32_from_fcmp
-; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
-; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float>
-; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp ueq <4 x float> [[VEC_LOAD]], splat (float 3.000000e+00)
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], splat (i1 true)
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
-; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
+define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) {
+; CHECK-VF4IC1-LABEL: define i32 @select_const_i32_from_fcmp(
+; CHECK-VF4IC1-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC1: [[VECTOR_PH]]:
+; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC1: [[VECTOR_BODY]]:
+; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
+; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
+; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-VF4IC1: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP7]]
+; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1: [[SCALAR_PH]]:
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC1: [[LOOP]]:
+; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC1-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC1-NEXT: [[LOAD_V_IV:%.*]] = load float, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC1-NEXT: [[CMP_V_IV_3:%.*]] = fcmp ueq float [[LOAD_V_IV]], 3.000000e+00
+; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_V_IV_3]], i32 [[RDX]], i32 1
+; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-VF4IC1: [[EXIT]]:
+; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF4IC4-LABEL: define i32 @select_const_i32_from_fcmp(
+; CHECK-VF4IC4-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4IC4: [[VECTOR_PH]]:
+; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4IC4: [[VECTOR_BODY]]:
+; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
+; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4
+; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8
+; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD4]], splat (float 3.000000e+00)
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD5]], splat (float 3.000000e+00)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD6]], splat (float 3.000000e+00)
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP9]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI]], [[TMP10]]
+; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]]
+; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI2]], [[TMP12]]
+; CHECK-VF4IC4-NEXT: [[TMP17]] = or <4 x i1> [[VEC_PHI3]], [[TMP13]]
+; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-VF4IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF4IC4-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP15]], [[TMP14]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX7:%.*]] = or <4 x i1> [[TMP16]], [[BIN_RDX]]
+; CHECK-VF4IC4-NEXT: [[BIN_RDX8:%.*]] = or <4 x i1> [[TMP17]], [[BIN_RDX7]]
+; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX8]])
+; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]]
+; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 1, i32 2
+; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4: [[SCALAR_PH]]:
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF4IC4: [[LOOP]]:
+; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF4IC4-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[IV]]
+; CHECK-VF4IC4-NEXT: [[LOAD_V_IV:%.*]] = load float, ptr [[GEP_V_IV]], align 4
+; CHECK-VF4IC4-NEXT: [[CMP_V_IV_3:%.*]] = fcmp ueq float [[LOAD_V_IV]], 3.000000e+00
+; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_V_IV_3]], i32 [[RDX]], i32 1
+; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-VF4IC4: [[EXIT]]:
+; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
+; CHECK-VF1IC4-LABEL: define i32 @select_const_i32_from_fcmp(
+; CHECK-VF1IC4-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]:
+; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF1IC4: [[VECTOR_PH]]:
+; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-VF1IC4: [[VECTOR_BODY]]:
+; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]]
+; CHECK-VF1IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP1]]
+; CHECK-VF1IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP2]]
+; CHECK-VF1IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP3]]
+; CHECK-VF1IC4-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP4]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
+; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp ueq float [[TMP8]], 3.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp ueq float [[TMP9]], 3.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp ueq float [[TMP10]], 3.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP11]], 3.000000e+00
+; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true
+; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true
+; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true
+; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true
+; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]]
+; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]]
+; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]]
+; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]]
+; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-VF1IC4: [[MIDDLE_BLOCK]]:
+; CHECK-VF1IC4-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP21]], [[TMP20]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX4:%.*]] = or i1 [[TMP22]], [[BIN_RDX]]
+; CHECK-VF1IC4-NEXT: [[BIN_RDX5:%.*]] = or i1 [[TMP23]], [[BIN_RDX4]]
+; CHECK-VF1IC4-NEXT: [[TMP25:%.*]] = freeze i1 [[BIN_RDX5]]
+; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP25]], i32 1, i32 2
+; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4: [[SCALAR_PH]]:
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]]
+; CHECK-VF1IC4: [[LOOP]]:
+; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-VF1IC4-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[IV]]
+; CHECK-VF1IC4-NEXT: [[LOAD_V_IV:%.*]] = load float, ptr [[GEP_V_IV]], align 4
+; CHECK-VF1IC4-NEXT: [[CMP_V_IV_3:%.*]] = fcmp ueq float [[LOAD_V_IV]], 3.000000e+00
+; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_V_IV_3]], i32 [[RDX]], i32 1
+; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-VF1IC4: [[EXIT]]:
+; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]]
+;
 entry:
- br label %for.body
+ br label %loop
-for.body: ; preds = %entry, %for.body
- %0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
- %1 = phi i32 [ 2, %entry ], [ %5, %for.body ]
- %2 = getelementptr inbounds
float, ptr %v, i64 %0 - %3 = load float, ptr %2, align 4 - %4 = fcmp ueq float %3, 3.0 - %5 = select i1 %4, i32 %1, i32 1 - %6 = add nuw nsw i64 %0, 1 - %7 = icmp eq i64 %6, %n - br i1 %7, label %exit, label %for.body +loop: ; preds = %entry, %loop + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 2, %entry ], [ %sel, %loop ] + %gep.v.iv = getelementptr inbounds float, ptr %v, i64 %iv + %load.v.iv = load float, ptr %gep.v.iv, align 4 + %cmp.v.iv.3 = fcmp ueq float %load.v.iv, 3.0 + %sel = select i1 %cmp.v.iv.3, i32 %rdx, i32 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, %n + br i1 %exit.cond, label %exit, label %loop -exit: ; preds = %for.body - ret i32 %5 +exit: ; preds = %loop + ret i32 %sel } - define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) { -; CHECK-LABEL: @select_i32_from_icmp_same_inputs -; CHECK-VF4IC1: vector.ph: -; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 -; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC1-NOT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0 -; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[SPLAT_OF_A]], splat (i32 3) -; CHECK-VF4IC1: vector.body: -; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] -; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], splat (i1 true) -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]] -; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]]) -; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a +; CHECK-VF4IC1-LABEL: define i32 @select_i32_from_icmp_same_inputs( +; CHECK-VF4IC1-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3) +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true) +; CHECK-VF4IC1-NEXT: [[TMP2]] = or <4 x i1> [[VEC_PHI]], [[TMP1]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = 
freeze i1 [[TMP4]] +; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i32 [[B]], i32 [[A]] +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[CMP_RDX_3:%.*]] = icmp eq i32 [[RDX]], 3 +; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_RDX_3]], i32 [[RDX]], i32 [[B]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_i32_from_icmp_same_inputs( +; CHECK-VF4IC4-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3) +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP6]] = or <4 x i1> [[VEC_PHI1]], [[TMP2]] +; CHECK-VF4IC4-NEXT: [[TMP7]] = or <4 x i1> [[VEC_PHI2]], [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP8]] = or <4 x i1> [[VEC_PHI3]], [[TMP4]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; 
CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP6]], [[TMP5]] +; CHECK-VF4IC4-NEXT: [[BIN_RDX4:%.*]] = or <4 x i1> [[TMP7]], [[BIN_RDX]] +; CHECK-VF4IC4-NEXT: [[BIN_RDX5:%.*]] = or <4 x i1> [[TMP8]], [[BIN_RDX4]] +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX5]]) +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] +; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[B]], i32 [[A]] +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC4: [[LOOP]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF4IC4-NEXT: [[CMP_RDX_3:%.*]] = icmp eq i32 [[RDX]], 3 +; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_RDX_3]], i32 [[RDX]], i32 [[B]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @select_i32_from_icmp_same_inputs( +; CHECK-VF1IC4-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF1IC4: [[VECTOR_PH]]: +; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = icmp eq i32 [[A]], 3 +; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF1IC4: [[VECTOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = xor i1 [[TMP0]], true +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = xor i1 [[TMP0]], true +; CHECK-VF1IC4-NEXT: [[TMP3:%.*]] = xor i1 [[TMP0]], true +; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = xor i1 [[TMP0]], true +; CHECK-VF1IC4-NEXT: [[TMP5]] = or i1 [[VEC_PHI]], [[TMP1]] +; CHECK-VF1IC4-NEXT: [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP2]] +; CHECK-VF1IC4-NEXT: [[TMP7]] = or i1 
[[VEC_PHI2]], [[TMP3]] +; CHECK-VF1IC4-NEXT: [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP4]] +; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF1IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF1IC4-NEXT: [[BIN_RDX:%.*]] = or i1 [[TMP6]], [[TMP5]] +; CHECK-VF1IC4-NEXT: [[BIN_RDX4:%.*]] = or i1 [[TMP7]], [[BIN_RDX]] +; CHECK-VF1IC4-NEXT: [[BIN_RDX5:%.*]] = or i1 [[TMP8]], [[BIN_RDX4]] +; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = freeze i1 [[BIN_RDX5]] +; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i32 [[B]], i32 [[A]] +; CHECK-VF1IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF1IC4: [[SCALAR_PH]]: +; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ] +; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC4: [[LOOP]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: [[CMP_RDX_3:%.*]] = icmp eq i32 [[RDX]], 3 +; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_RDX_3]], i32 [[RDX]], i32 [[B]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]] +; entry: - br label %for.body + br label %loop -for.body: ; preds = %entry, %for.body - %0 = phi i64 [ 0, %entry ], [ %4, %for.body ] - %1 = phi i32 [ %a, %entry ], [ %3, %for.body ] - %2 = icmp eq i32 %1, 3 - %3 = select i1 %2, i32 %1, i32 %b - %4 = add nuw nsw i64 %0, 1 - %5 = icmp eq i64 %4, %n - br i1 %5, label %exit, label %for.body +loop: ; preds = %entry, %loop + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %a, %entry ], [ %sel, %loop ] + %cmp.rdx.3 = icmp eq i32 %rdx, 3 + %sel = select i1 %cmp.rdx.3, i32 %rdx, i32 %b + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, %n + br i1 %exit.cond, label %exit, label %loop -exit: ; preds = %for.body - ret i32 %3 +exit: ; preds = %loop + ret i32 %sel } - -; Negative tests +;; Negative tests ; We don't support FP reduction variables at the moment. 
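For orientation, a minimal scalar C++ sketch of the pattern this negative test guards against; the function name and types are illustrative, and the IR test below is the authoritative form:

    // Sketch only: the reduction variable is floating point, which the
    // select-from-compare reduction support does not handle yet.
    float select_const_f32_from_icmp_sketch(const int *v, long n) {
      float r = 3.0f;                // FP reduction variable
      for (long i = 0; i < n; ++i)
        r = (v[i] == 3) ? r : 7.0f;  // selects between FP constants
      return r;
    }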
-define float @select_const_f32_from_icmp(ptr nocapture readonly %v, i64 %n) { -; CHECK: @select_const_f32_from_icmp -; CHECK-NOT: vector.body +define float @select_const_f32_from_icmp(ptr %v, i64 %n) { +; CHECK-LABEL: define float @select_const_f32_from_icmp( +; CHECK-SAME: ptr [[V:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi fast float [ 3.000000e+00, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_V_IV:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_V_IV:%.*]] = load i32, ptr [[GEP_V_IV]], align 4 +; CHECK-NEXT: [[CMP_V_IV_3:%.*]] = icmp eq i32 [[LOAD_V_IV]], 3 +; CHECK-NEXT: [[SEL]] = select fast i1 [[CMP_V_IV_3]], float [[RDX]], float 7.000000e+00 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi float [ [[SEL]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[SEL_LCSSA]] +; entry: - br label %for.body + br label %loop -for.body: ; preds = %entry, %for.body - %0 = phi i64 [ 0, %entry ], [ %6, %for.body ] - %1 = phi fast float [ 3.0, %entry ], [ %5, %for.body ] - %2 = getelementptr inbounds i32, ptr %v, i64 %0 - %3 = load i32, ptr %2, align 4 - %4 = icmp eq i32 %3, 3 - %5 = select fast i1 %4, float %1, float 7.0 - %6 = add nuw nsw i64 %0, 1 - %7 = icmp eq i64 %6, %n - br i1 %7, label %exit, label %for.body +loop: ; preds = %entry, %loop + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi fast float [ 3.0, %entry ], [ %sel, %loop ] + %gep.v.iv = getelementptr inbounds i32, ptr %v, i64 %iv + %load.v.iv = load i32, ptr %gep.v.iv, align 4 + %cmp.v.iv.3 = icmp eq i32 %load.v.iv, 3 + %sel = select fast i1 %cmp.v.iv.3, float %rdx, float 7.0 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, %n + br i1 %exit.cond, label %exit, label %loop -exit: ; preds = %for.body - ret float %5 +exit: ; preds = %loop + ret float %sel } - ; We don't support selecting loop-variant values. 
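Likewise, a minimal C++ sketch of the loop-variant case; names are illustrative, and the IR test below is authoritative:

    // Sketch only: the select's false operand is loaded inside the loop, so
    // the selected value is loop-variant and the reduction is not recognized.
    int select_variant_i32_from_icmp_sketch(const int *v1, const int *v2,
                                            long n) {
      int r = 3;
      for (long i = 0; i < n; ++i)
        r = (v1[i] == 3) ? r : v2[i];  // v2[i] changes every iteration
      return r;
    }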
-define i32 @select_variant_i32_from_icmp(ptr nocapture readonly %v1, ptr nocapture readonly %v2, i64 %n) { -; CHECK-LABEL: @select_variant_i32_from_icmp -; CHECK-NOT: vector.body +define i32 @select_variant_i32_from_icmp(ptr %v1, ptr %v2, i64 %n) { +; CHECK-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[SEL_LCSSA]] +; entry: - br label %for.body + br label %loop -for.body: ; preds = %entry, %for.body - %0 = phi i64 [ 0, %entry ], [ %8, %for.body ] - %1 = phi i32 [ 3, %entry ], [ %7, %for.body ] - %2 = getelementptr inbounds i32, ptr %v1, i64 %0 - %3 = load i32, ptr %2, align 4 - %4 = getelementptr inbounds i32, ptr %v2, i64 %0 - %5 = load i32, ptr %4, align 4 - %6 = icmp eq i32 %3, 3 - %7 = select i1 %6, i32 %1, i32 %5 - %8 = add nuw nsw i64 %0, 1 - %9 = icmp eq i64 %8, %n - br i1 %9, label %exit, label %for.body +loop: ; preds = %entry, %loop + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 3, %entry ], [ %sel, %loop ] + %gep.v1.iv = getelementptr inbounds i32, ptr %v1, i64 %iv + %load.v1.iv = load i32, ptr %gep.v1.iv, align 4 + %gep.v2.iv = getelementptr inbounds i32, ptr %v2, i64 %iv + %load.v2.iv = load i32, ptr %gep.v2.iv, align 4 + %cmp.v1.iv.3 = icmp eq i32 %load.v1.iv, 3 + %sel = select i1 %cmp.v1.iv.3, i32 %rdx, i32 %load.v2.iv + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, %n + br i1 %exit.cond, label %exit, label %loop -exit: ; preds = %for.body - ret i32 %7 +exit: ; preds = %loop + ret i32 %sel } - ; We only support selects where the input comes from the same PHI as the ; reduction PHI. In the example below, the select uses the induction ; variable input and the icmp uses the reduction PHI. 
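A minimal C++ sketch of this mismatch; names are illustrative, and the IR test below is authoritative:

    // Sketch only: the compare uses the reduction PHI (r), but the select's
    // true operand is the induction variable (i) rather than r, so the
    // pattern is rejected.
    int select_i32_from_icmp_non_redux_phi_sketch(int a, int b, int n) {
      int r = a;
      for (int i = 0; i < n; ++i)
        r = (r == 3) ? i : b;
      return r;
    }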
define i32 @select_i32_from_icmp_non_redux_phi(i32 %a, i32 %b, i32 %n) { -; CHECK-LABEL: @select_i32_from_icmp_non_redux_phi -; CHECK-NOT: vector.body +; CHECK-LABEL: define i32 @select_i32_from_icmp_non_redux_phi( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[A]], %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[CMP_RDX_3:%.*]] = icmp eq i32 [[RDX]], 3 +; CHECK-NEXT: [[SEL]] = select i1 [[CMP_RDX_3]], i32 [[IV]], i32 [[B]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[SEL_LCSSA]] +; entry: - br label %for.body + br label %loop -for.body: ; preds = %entry, %for.body - %0 = phi i32 [ 0, %entry ], [ %4, %for.body ] - %1 = phi i32 [ %a, %entry ], [ %3, %for.body ] - %2 = icmp eq i32 %1, 3 - %3 = select i1 %2, i32 %0, i32 %b - %4 = add nuw nsw i32 %0, 1 - %5 = icmp eq i32 %4, %n - br i1 %5, label %exit, label %for.body +loop: ; preds = %entry, %loop + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %a, %entry ], [ %sel, %loop ] + %cmp.rdx.3 = icmp eq i32 %rdx, 3 + %sel = select i1 %cmp.rdx.3, i32 %iv, i32 %b + %iv.next = add nuw nsw i32 %iv, 1 + %exit.cond = icmp eq i32 %iv.next, %n + br i1 %exit.cond, label %exit, label %loop -exit: ; preds = %for.body - ret i32 %3 +exit: ; preds = %loop + ret i32 %sel } +;. +; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +;. 
+; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +;. +; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF1IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF1IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK-VF1IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; CHECK-VF1IC4: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} +; CHECK-VF1IC4: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} +; CHECK-VF1IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} +; CHECK-VF1IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-VF1IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} +;. diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memmove-redundant-memmove.ll b/llvm/test/Transforms/MemCpyOpt/memset-memmove-redundant-memmove.ll new file mode 100644 index 0000000000000..c7593e2941518 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/memset-memmove-redundant-memmove.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=memcpyopt -S %s -verify-memoryssa | FileCheck %s + +; Redundant memmove. +define i32 @redundant_memmove() { +; CHECK-LABEL: @redundant_memmove( +; CHECK-NEXT: [[ARRAY:%.*]] = alloca [26 x i32], align 16 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ARRAY]], i8 0, i64 104, i1 false) +; CHECK-NEXT: [[ARRAY_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 4 +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY]], align 16 +; CHECK-NEXT: ret i32 [[VAL]] +; + %array = alloca [26 x i32], align 16 + call void @llvm.memset.p0.i64(ptr align 16 %array, i8 0, i64 104, i1 false) + %array.idx = getelementptr inbounds i8, ptr %array, i64 4 + call void @llvm.memmove.p0.p0.i64(ptr align 16 %array, ptr align 4 %array.idx, i64 100, i1 false) + %val = load i32, ptr %array, align 16 + ret i32 %val +} + +; Used memmove, buffer is reset to zero. 
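For contrast with the redundant case above, a C++ sketch of why this memmove must be kept; the name is illustrative, and the sizes mirror the IR below:

    #include <cstring>
    // Sketch only: the intervening store makes the memmove observable (it
    // copies the 1 at byte offset 4 down to offset 0), so MemCpyOpt must
    // keep it even though the reloaded slot reads 0 again.
    int used_memmove_sketch() {
      int a[26];
      std::memset(a, 0, sizeof(a));         // 104 bytes, all zero
      a[1] = 1;                             // store at byte offset 4
      std::memmove(a, (char *)a + 4, 100);  // a[0] becomes 1
      return a[1];                          // now 0 again
    }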
+define i32 @used_memmove_1() { +; CHECK-LABEL: @used_memmove_1( +; CHECK-NEXT: [[ARRAY:%.*]] = alloca [26 x i32], align 16 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ARRAY]], i8 0, i64 104, i1 false) +; CHECK-NEXT: [[ARRAY_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 4 +; CHECK-NEXT: store i32 1, ptr [[ARRAY_IDX]], align 4 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 16 [[ARRAY]], ptr align 4 [[ARRAY_IDX]], i64 100, i1 false) +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY_IDX]], align 4 +; CHECK-NEXT: ret i32 [[VAL]] +; + %array = alloca [26 x i32], align 16 + call void @llvm.memset.p0.i64(ptr align 16 %array, i8 0, i64 104, i1 false) + %array.idx = getelementptr inbounds i8, ptr %array, i64 4 + store i32 1, ptr %array.idx + call void @llvm.memmove.p0.p0.i64(ptr align 16 %array, ptr align 4 %array.idx, i64 100, i1 false) + %val = load i32, ptr %array.idx, align 4 + ret i32 %val +} + +; Used memmove. +define i32 @used_memmove_2() { +; CHECK-LABEL: @used_memmove_2( +; CHECK-NEXT: [[ARRAY:%.*]] = alloca [26 x i32], align 16 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ARRAY]], i8 0, i64 104, i1 false) +; CHECK-NEXT: [[ARRAY_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 4 +; CHECK-NEXT: store i32 1, ptr [[ARRAY]], align 4 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 16 [[ARRAY]], ptr align 4 [[ARRAY_IDX]], i64 100, i1 false) +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY_IDX]], align 4 +; CHECK-NEXT: ret i32 [[VAL]] +; + %array = alloca [26 x i32], align 16 + call void @llvm.memset.p0.i64(ptr align 16 %array, i8 0, i64 104, i1 false) + %array.idx = getelementptr inbounds i8, ptr %array, i64 4 + store i32 1, ptr %array + call void @llvm.memmove.p0.p0.i64(ptr align 16 %array, ptr align 4 %array.idx, i64 100, i1 false) + %val = load i32, ptr %array.idx, align 4 + ret i32 %val +} + +; Used memmove, buffer clobbered by opaque. +define i32 @used_memmove_3() { +; CHECK-LABEL: @used_memmove_3( +; CHECK-NEXT: [[ARRAY:%.*]] = alloca [25 x i32], align 16 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ARRAY]], i8 0, i64 100, i1 false) +; CHECK-NEXT: call void @opaque(ptr [[ARRAY]]) +; CHECK-NEXT: [[ARRAY_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 4 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 16 [[ARRAY]], ptr align 4 [[ARRAY_IDX]], i64 96, i1 false) +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY]], align 16 +; CHECK-NEXT: ret i32 [[VAL]] +; + %array = alloca [25 x i32], align 16 + call void @llvm.memset.p0.i64(ptr align 16 %array, i8 0, i64 100, i1 false) + call void @opaque(ptr %array) + %array.idx = getelementptr inbounds i8, ptr %array, i64 4 + call void @llvm.memmove.p0.p0.i64(ptr align 16 %array, ptr align 4 %array.idx, i64 96, i1 false) + %val = load i32, ptr %array, align 16 + ret i32 %val +} + +; Redundant memmove, not within the same basic block. 
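A C++ sketch of the cross-block variant; the name is illustrative, and removal presumably depends on the pass seeing no clobber of the buffer between the two calls:

    #include <cstring>
    // Sketch only: in the IR the memset and the memmove are in different
    // basic blocks (entry and use), but nothing touches the buffer in
    // between, so the memmove is still redundant.
    int cross_block_sketch() {
      int a[26];
      std::memset(a, 0, sizeof(a));         // whole buffer zeroed
      // (block boundary in the IR test; no intervening writes)
      std::memmove(a, (char *)a + 4, 100);  // zeros onto zeros: removable
      return a[0];
    }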
+define i32 @redundant_memmove_different_bbs() { +; CHECK-LABEL: @redundant_memmove_different_bbs( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAY:%.*]] = alloca [26 x i32], align 16 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ARRAY]], i8 0, i64 104, i1 false) +; CHECK-NEXT: [[ARRAY_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 4 +; CHECK-NEXT: br label [[USE:%.*]] +; CHECK: use: +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY]], align 16 +; CHECK-NEXT: ret i32 [[VAL]] +; +entry: + %array = alloca [26 x i32], align 16 + call void @llvm.memset.p0.i64(ptr align 16 %array, i8 0, i64 104, i1 false) + %array.idx = getelementptr inbounds i8, ptr %array, i64 4 + br label %use + +use: ; preds = %entry + call void @llvm.memmove.p0.p0.i64(ptr align 16 %array, ptr align 4 %array.idx, i64 100, i1 false) + %val = load i32, ptr %array, align 16 + ret i32 %val +} + +@g_var = global [26 x i32] zeroinitializer, align 16 + +; Redundant memmove on a global variable. +define ptr @redundant_memmove_memset_global_variable() { +; CHECK-LABEL: @redundant_memmove_memset_global_variable( +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 @g_var, i8 0, i64 104, i1 false) +; CHECK-NEXT: ret ptr @g_var +; + call void @llvm.memset.p0.i64(ptr align 16 @g_var, i8 0, i64 104, i1 false) + call void @llvm.memmove.p0.p0.i64(ptr align 16 @g_var, ptr align 4 getelementptr inbounds nuw (i8, ptr @g_var, i64 4), i64 100, i1 false) + ret ptr @g_var +} + +; Memset only partial. +define i32 @partial_memset() { +; CHECK-LABEL: @partial_memset( +; CHECK-NEXT: [[ARRAY:%.*]] = alloca [26 x i32], align 16 +; CHECK-NEXT: [[ARRAY_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 92 +; CHECK-NEXT: store i32 1, ptr [[ARRAY_IDX]], align 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ARRAY]], i8 0, i64 26, i1 false) +; CHECK-NEXT: [[ARRAY_IDX_2:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 4 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 16 [[ARRAY]], ptr align 4 [[ARRAY_IDX_2]], i64 100, i1 false) +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY_IDX]], align 4 +; CHECK-NEXT: ret i32 [[VAL]] +; + %array = alloca [26 x i32], align 16 + %array.idx = getelementptr inbounds i8, ptr %array, i64 92 + store i32 1, ptr %array.idx + call void @llvm.memset.p0.i64(ptr align 16 %array, i8 0, i64 26, i1 false) + %array.idx.2 = getelementptr inbounds i8, ptr %array, i64 4 + call void @llvm.memmove.p0.p0.i64(ptr align 16 %array, ptr align 4 %array.idx.2, i64 100, i1 false) + %val = load i32, ptr %array.idx, align 4 + ret i32 %val +} + +; Memset length not constant. 
+define i32 @memset_length_not_constant(i64 %size) { +; CHECK-LABEL: @memset_length_not_constant( +; CHECK-NEXT: [[ARRAY:%.*]] = alloca [26 x i32], align 16 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ARRAY]], i8 0, i64 [[SIZE:%.*]], i1 false) +; CHECK-NEXT: [[ARRAY_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 4 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 16 [[ARRAY]], ptr align 4 [[ARRAY_IDX]], i64 100, i1 false) +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY]], align 16 +; CHECK-NEXT: ret i32 [[VAL]] +; + %array = alloca [26 x i32], align 16 + call void @llvm.memset.p0.i64(ptr align 16 %array, i8 0, i64 %size, i1 false) + %array.idx = getelementptr inbounds i8, ptr %array, i64 4 + call void @llvm.memmove.p0.p0.i64(ptr align 16 %array, ptr align 4 %array.idx, i64 100, i1 false) + %val = load i32, ptr %array, align 16 + ret i32 %val +} + +; Memmove buffer not memset'd, different buffers. +define i32 @memset_memmove_dest_buffers_not_alias() { +; CHECK-LABEL: @memset_memmove_dest_buffers_not_alias( +; CHECK-NEXT: [[ARRAY:%.*]] = alloca [26 x i32], align 16 +; CHECK-NEXT: [[ARRAY2:%.*]] = alloca [26 x i32], align 16 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[ARRAY]], i8 0, i64 104, i1 false) +; CHECK-NEXT: [[ARRAY2_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARRAY2]], i64 4 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 16 [[ARRAY2]], ptr align 4 [[ARRAY2_IDX]], i64 100, i1 false) +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY2]], align 16 +; CHECK-NEXT: ret i32 [[VAL]] +; + %array = alloca [26 x i32], align 16 + %array2 = alloca [26 x i32], align 16 + call void @llvm.memset.p0.i64(ptr align 16 %array, i8 0, i64 104, i1 false) + %array2.idx = getelementptr inbounds i8, ptr %array2, i64 4 + call void @llvm.memmove.p0.p0.i64(ptr align 16 %array2, ptr align 4 %array2.idx, i64 100, i1 false) + %val = load i32, ptr %array2, align 16 + ret i32 %val +} + +declare void @opaque(ptr) +declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) +declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) diff --git a/llvm/test/Transforms/ObjCARC/allocas.ll b/llvm/test/Transforms/ObjCARC/allocas.ll index be829882ae374..6fe2edf3e2dd4 100644 --- a/llvm/test/Transforms/ObjCARC/allocas.ll +++ b/llvm/test/Transforms/ObjCARC/allocas.ll @@ -109,16 +109,16 @@ entry: } -; CHECK: define void @test1d(ptr %x) +; CHECK: define void @test1d(ptr %x, i1 %arg) ; CHECK: @llvm.objc.retain(ptr %x) ; CHECK: @llvm.objc.retain(ptr %x) ; CHECK: @llvm.objc.release(ptr %y) ; CHECK: @llvm.objc.release(ptr %x) ; CHECK: ret void ; CHECK: } -define void @test1d(ptr %x) { +define void @test1d(ptr %x, i1 %arg) { entry: - br i1 undef, label %use_allocaA, label %use_allocaB + br i1 %arg, label %use_allocaA, label %use_allocaB use_allocaA: %allocaA = alloca ptr @@ -141,16 +141,16 @@ exit: ret void } -; CHECK: define void @test1e(ptr %x) +; CHECK: define void @test1e(ptr %x, i1 %arg) ; CHECK: @llvm.objc.retain(ptr %x) ; CHECK: @llvm.objc.retain(ptr %x) ; CHECK: @llvm.objc.release(ptr %y) ; CHECK: @llvm.objc.release(ptr %x) ; CHECK: ret void ; CHECK: } -define void @test1e(ptr %x) { +define void @test1e(ptr %x, i1 %arg) { entry: - br i1 undef, label %use_allocaA, label %use_allocaB + br i1 %arg, label %use_allocaA, label %use_allocaB use_allocaA: %allocaA = alloca ptr, i32 4 diff --git a/llvm/test/Transforms/ObjCARC/basic.ll b/llvm/test/Transforms/ObjCARC/basic.ll index 0ee59dc8ba6ab..d461bc0af680d 100644 --- 
a/llvm/test/Transforms/ObjCARC/basic.ll +++ b/llvm/test/Transforms/ObjCARC/basic.ll @@ -1761,13 +1761,13 @@ g: ; CHECK-LABEL: define void @test39( ; CHECK-NOT: @llvm.objc. ; CHECK: {{^}}} -define void @test39(ptr %p) { +define void @test39(ptr %p, i1 %arg) { entry: %0 = call ptr @llvm.objc.retain(ptr %p) br label %loop loop: ; preds = %loop, %entry - br i1 undef, label %loop, label %exit + br i1 %arg, label %loop, label %exit exit: ; preds = %loop call void @llvm.objc.release(ptr %0), !clang.imprecise_release !0 @@ -1779,14 +1779,14 @@ exit: ; preds = %loop ; CHECK-LABEL: define void @test39b( ; CHECK-NOT: @llvm.objc. ; CHECK: {{^}}} -define void @test39b(ptr %p) { +define void @test39b(ptr %p, i1 %arg) { entry: %0 = call ptr @llvm.objc.retain(ptr %p) br label %loop loop: ; preds = %loop, %entry store i8 0, ptr %0 - br i1 undef, label %loop, label %exit + br i1 %arg, label %loop, label %exit exit: ; preds = %loop call void @llvm.objc.release(ptr %0), !clang.imprecise_release !0 @@ -1798,14 +1798,14 @@ exit: ; preds = %loop ; CHECK-LABEL: define void @test39c( ; CHECK-NOT: @llvm.objc. ; CHECK: {{^}}} -define void @test39c(ptr %p) { +define void @test39c(ptr %p, i1 %arg) { entry: %0 = call ptr @llvm.objc.retain(ptr %p) br label %loop loop: ; preds = %loop, %entry call void @use_pointer(ptr %0) - br i1 undef, label %loop, label %exit + br i1 %arg, label %loop, label %exit exit: ; preds = %loop call void @llvm.objc.release(ptr %0), !clang.imprecise_release !0 @@ -1818,14 +1818,14 @@ exit: ; preds = %loop ; CHECK-LABEL: define void @test40( ; CHECK-NOT: @llvm.objc. ; CHECK: {{^}}} -define void @test40(ptr %p) { +define void @test40(ptr %p, i1 %arg) { entry: %0 = call ptr @llvm.objc.retain(ptr %p) br label %loop loop: ; preds = %loop, %entry call void @use_pointer(ptr %0) - br i1 undef, label %exit, label %loop + br i1 %arg, label %exit, label %loop exit: ; preds = %loop call void @llvm.objc.release(ptr %0), !clang.imprecise_release !0 diff --git a/llvm/test/Transforms/ObjCARC/cfg-hazards.ll b/llvm/test/Transforms/ObjCARC/cfg-hazards.ll index 3e762de689d1f..d43f713d925e0 100644 --- a/llvm/test/Transforms/ObjCARC/cfg-hazards.ll +++ b/llvm/test/Transforms/ObjCARC/cfg-hazards.ll @@ -111,7 +111,7 @@ for.end: ; preds = %for.body } ; Delete nested retain+release pairs around loops. 
-define void @test3(ptr %a) nounwind { +define void @test3(ptr %a, i1 %arg) nounwind { ; CHECK-LABEL: @test3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OUTER:%.*]] = tail call ptr @llvm.objc.retain(ptr [[A:%.*]]) #[[ATTR0]] @@ -119,7 +119,7 @@ define void @test3(ptr %a) nounwind { ; CHECK: loop: ; CHECK-NEXT: call void @callee() ; CHECK-NEXT: store i8 0, ptr [[A]], align 1 -; CHECK-NEXT: br i1 undef, label [[LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 %arg, label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: call void @llvm.objc.release(ptr [[A]]) #[[ATTR0]], !clang.imprecise_release !0 ; CHECK-NEXT: ret void @@ -132,7 +132,7 @@ entry: loop: call void @callee() store i8 0, ptr %a - br i1 undef, label %loop, label %exit + br i1 %arg, label %loop, label %exit exit: call void @llvm.objc.release(ptr %a) nounwind @@ -140,7 +140,7 @@ exit: ret void } -define void @test4(ptr %a) nounwind { +define void @test4(ptr %a, i1 %arg) nounwind { ; CHECK-LABEL: @test4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OUTER:%.*]] = tail call ptr @llvm.objc.retain(ptr [[A:%.*]]) #[[ATTR0]] @@ -151,7 +151,7 @@ define void @test4(ptr %a) nounwind { ; CHECK-NEXT: call void @callee() ; CHECK-NEXT: call void @callee() ; CHECK-NEXT: store i8 0, ptr [[A]], align 1 -; CHECK-NEXT: br i1 undef, label [[LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 %arg, label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: call void @llvm.objc.release(ptr [[A]]) #[[ATTR0]], !clang.imprecise_release !0 ; CHECK-NEXT: ret void @@ -168,7 +168,7 @@ more: call void @callee() call void @callee() store i8 0, ptr %a - br i1 undef, label %loop, label %exit + br i1 %arg, label %loop, label %exit exit: call void @llvm.objc.release(ptr %a) nounwind @@ -176,18 +176,18 @@ exit: ret void } -define void @test5(ptr %a) nounwind { +define void @test5(ptr %a, i1 %arg) nounwind { ; CHECK-LABEL: @test5( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OUTER:%.*]] = tail call ptr @llvm.objc.retain(ptr [[A:%.*]]) #[[ATTR0]] ; CHECK-NEXT: call void @callee() ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: br i1 undef, label [[TRUE:%.*]], label [[MORE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[TRUE:%.*]], label [[MORE:%.*]] ; CHECK: true: ; CHECK-NEXT: br label [[MORE]] ; CHECK: more: -; CHECK-NEXT: br i1 undef, label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 %arg, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: call void @use_pointer(ptr [[A]]) ; CHECK-NEXT: call void @llvm.objc.release(ptr [[A]]) #[[ATTR0]], !clang.imprecise_release !0 @@ -200,13 +200,13 @@ entry: br label %loop loop: - br i1 undef, label %true, label %more + br i1 %arg, label %true, label %more true: br label %more more: - br i1 undef, label %exit, label %loop + br i1 %arg, label %exit, label %loop exit: call void @use_pointer(ptr %a) @@ -215,18 +215,18 @@ exit: ret void } -define void @test6(ptr %a) nounwind { +define void @test6(ptr %a, i1 %arg) nounwind { ; CHECK-LABEL: @test6( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OUTER:%.*]] = tail call ptr @llvm.objc.retain(ptr [[A:%.*]]) #[[ATTR0]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: br i1 undef, label [[TRUE:%.*]], label [[MORE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[TRUE:%.*]], label [[MORE:%.*]] ; CHECK: true: ; CHECK-NEXT: call void @callee() ; CHECK-NEXT: br label [[MORE]] ; CHECK: more: -; CHECK-NEXT: br i1 undef, label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 %arg, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: call void @use_pointer(ptr [[A]]) ; 
CHECK-NEXT: call void @llvm.objc.release(ptr [[A]]) #[[ATTR0]], !clang.imprecise_release !0 @@ -238,14 +238,14 @@ entry: br label %loop loop: - br i1 undef, label %true, label %more + br i1 %arg, label %true, label %more true: call void @callee() br label %more more: - br i1 undef, label %exit, label %loop + br i1 %arg, label %exit, label %loop exit: call void @use_pointer(ptr %a) @@ -254,19 +254,19 @@ exit: ret void } -define void @test7(ptr %a) nounwind { +define void @test7(ptr %a, i1 %arg) nounwind { ; CHECK-LABEL: @test7( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OUTER:%.*]] = tail call ptr @llvm.objc.retain(ptr [[A:%.*]]) #[[ATTR0]] ; CHECK-NEXT: call void @callee() ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: br i1 undef, label [[TRUE:%.*]], label [[MORE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[TRUE:%.*]], label [[MORE:%.*]] ; CHECK: true: ; CHECK-NEXT: call void @use_pointer(ptr [[A]]) ; CHECK-NEXT: br label [[MORE]] ; CHECK: more: -; CHECK-NEXT: br i1 undef, label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 %arg, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: call void @llvm.objc.release(ptr [[A]]) #[[ATTR0]], !clang.imprecise_release !0 ; CHECK-NEXT: ret void @@ -278,14 +278,14 @@ entry: br label %loop loop: - br i1 undef, label %true, label %more + br i1 %arg, label %true, label %more true: call void @use_pointer(ptr %a) br label %more more: - br i1 undef, label %exit, label %loop + br i1 %arg, label %exit, label %loop exit: call void @llvm.objc.release(ptr %a) nounwind @@ -293,19 +293,19 @@ exit: ret void } -define void @test8(ptr %a) nounwind { +define void @test8(ptr %a, i1 %arg) nounwind { ; CHECK-LABEL: @test8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OUTER:%.*]] = tail call ptr @llvm.objc.retain(ptr [[A:%.*]]) #[[ATTR0]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: br i1 undef, label [[TRUE:%.*]], label [[MORE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[TRUE:%.*]], label [[MORE:%.*]] ; CHECK: true: ; CHECK-NEXT: call void @callee() ; CHECK-NEXT: call void @use_pointer(ptr [[A]]) ; CHECK-NEXT: br label [[MORE]] ; CHECK: more: -; CHECK-NEXT: br i1 undef, label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 %arg, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: call void @llvm.objc.release(ptr [[A]]) #[[ATTR0]], !clang.imprecise_release !0 ; CHECK-NEXT: ret void @@ -316,7 +316,7 @@ entry: br label %loop loop: - br i1 undef, label %true, label %more + br i1 %arg, label %true, label %more true: call void @callee() @@ -324,7 +324,7 @@ true: br label %more more: - br i1 undef, label %exit, label %loop + br i1 %arg, label %exit, label %loop exit: call void @llvm.objc.release(ptr %a) nounwind @@ -332,17 +332,17 @@ exit: ret void } -define void @test9(ptr %a) nounwind { +define void @test9(ptr %a, i1 %arg) nounwind { ; CHECK-LABEL: @test9( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: br i1 undef, label [[TRUE:%.*]], label [[MORE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[TRUE:%.*]], label [[MORE:%.*]] ; CHECK: true: ; CHECK-NEXT: call void @use_pointer(ptr [[A:%.*]]) ; CHECK-NEXT: br label [[MORE]] ; CHECK: more: -; CHECK-NEXT: br i1 undef, label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 %arg, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -352,14 +352,14 @@ entry: br label %loop loop: - br i1 undef, label %true, label %more + br i1 %arg, label %true, label %more true: call void @use_pointer(ptr %a) br label %more more: - br i1 undef, 
label %exit, label %loop + br i1 %arg, label %exit, label %loop exit: call void @llvm.objc.release(ptr %a) nounwind @@ -367,17 +367,17 @@ exit: ret void } -define void @test10(ptr %a) nounwind { +define void @test10(ptr %a, i1 %arg) nounwind { ; CHECK-LABEL: @test10( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: br i1 undef, label [[TRUE:%.*]], label [[MORE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[TRUE:%.*]], label [[MORE:%.*]] ; CHECK: true: ; CHECK-NEXT: call void @callee() ; CHECK-NEXT: br label [[MORE]] ; CHECK: more: -; CHECK-NEXT: br i1 undef, label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 %arg, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -387,14 +387,14 @@ entry: br label %loop loop: - br i1 undef, label %true, label %more + br i1 %arg, label %true, label %more true: call void @callee() br label %more more: - br i1 undef, label %exit, label %loop + br i1 %arg, label %exit, label %loop exit: call void @llvm.objc.release(ptr %a) nounwind @@ -402,16 +402,16 @@ exit: ret void } -define void @test11(ptr %a) nounwind { +define void @test11(ptr %a, i1 %arg) nounwind { ; CHECK-LABEL: @test11( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: br i1 undef, label [[TRUE:%.*]], label [[MORE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[TRUE:%.*]], label [[MORE:%.*]] ; CHECK: true: ; CHECK-NEXT: br label [[MORE]] ; CHECK: more: -; CHECK-NEXT: br i1 undef, label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 %arg, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -421,13 +421,13 @@ entry: br label %loop loop: - br i1 undef, label %true, label %more + br i1 %arg, label %true, label %more true: br label %more more: - br i1 undef, label %exit, label %loop + br i1 %arg, label %exit, label %loop exit: call void @llvm.objc.release(ptr %a) nounwind @@ -437,18 +437,18 @@ exit: ; Don't delete anything if they're not balanced. -define void @test12(ptr %a) nounwind { +define void @test12(ptr %a, i1 %arg) nounwind { ; CHECK-LABEL: @test12( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OUTER:%.*]] = tail call ptr @llvm.objc.retain(ptr [[A:%.*]]) #[[ATTR0]] ; CHECK-NEXT: [[INNER:%.*]] = tail call ptr @llvm.objc.retain(ptr [[A]]) #[[ATTR0]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: br i1 undef, label [[TRUE:%.*]], label [[MORE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[TRUE:%.*]], label [[MORE:%.*]] ; CHECK: true: ; CHECK-NEXT: ret void ; CHECK: more: -; CHECK-NEXT: br i1 undef, label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 %arg, label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: call void @llvm.objc.release(ptr [[A]]) #[[ATTR0]] ; CHECK-NEXT: call void @llvm.objc.release(ptr [[A]]) #[[ATTR0]], !clang.imprecise_release !0 @@ -460,13 +460,13 @@ entry: br label %loop loop: - br i1 undef, label %true, label %more + br i1 %arg, label %true, label %more true: ret void more: - br i1 undef, label %exit, label %loop + br i1 %arg, label %exit, label %loop exit: call void @llvm.objc.release(ptr %a) nounwind @@ -479,7 +479,7 @@ exit: ; by an alloca. 
; rdar://12969722 -define void @test13(ptr %a) nounwind { +define void @test13(ptr %a, i1 %arg) nounwind { ; CHECK-LABEL: @test13( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BLOCK:%.*]] = alloca ptr, align 8 @@ -491,7 +491,7 @@ define void @test13(ptr %a) nounwind { ; CHECK-NEXT: call void @block_callee(ptr [[BLOCK]]) ; CHECK-NEXT: [[RELOADED_A:%.*]] = load ptr, ptr [[BLOCK]], align 8 ; CHECK-NEXT: call void @llvm.objc.release(ptr [[RELOADED_A]]) #[[ATTR0]], !clang.imprecise_release !0 -; CHECK-NEXT: br i1 undef, label [[LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 %arg, label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: call void @llvm.objc.release(ptr [[A]]) #[[ATTR0]], !clang.imprecise_release !0 ; CHECK-NEXT: ret void @@ -507,7 +507,7 @@ loop: call void @block_callee(ptr %block) %reloaded_a = load ptr, ptr %block, align 8 call void @llvm.objc.release(ptr %reloaded_a) nounwind, !clang.imprecise_release !0 - br i1 undef, label %loop, label %exit + br i1 %arg, label %loop, label %exit exit: call void @llvm.objc.release(ptr %a) nounwind, !clang.imprecise_release !0 diff --git a/llvm/test/Transforms/ObjCARC/contract-testcases.ll b/llvm/test/Transforms/ObjCARC/contract-testcases.ll index 36fe49382e413..4c842823b54e0 100644 --- a/llvm/test/Transforms/ObjCARC/contract-testcases.ll +++ b/llvm/test/Transforms/ObjCARC/contract-testcases.ll @@ -19,13 +19,13 @@ declare i32 @__gxx_personality_sj0(...) ; Don't get in trouble on bugpointed code. ; CHECK-LABEL: define void @test0( -define void @test0() { +define void @test0(i1 %arg) { bb: %tmp1 = tail call ptr @llvm.objc.retainAutoreleasedReturnValue(ptr undef) nounwind br label %bb3 bb3: ; preds = %bb2 - br i1 undef, label %bb6, label %bb4 + br i1 %arg, label %bb6, label %bb4 bb4: ; preds = %bb3 switch i64 undef, label %bb5 [ @@ -45,15 +45,15 @@ bb6: ; preds = %bb5, %bb4, %bb4, %b ; for the same block, use the exactly same value in each block. ; CHECK-LABEL: define void @test1( -; CHECK: br i1 undef, label %bb7, label %bb7 +; CHECK: br i1 %arg, label %bb7, label %bb7 ; CHECK: bb7: ; CHECK: %tmp8 = phi ptr [ %tmp3, %bb ], [ %tmp3, %bb ] ; CHECK: } -define void @test1() { +define void @test1(i1 %arg) { bb: %tmp = tail call ptr @objc_msgSend() %tmp3 = tail call ptr @llvm.objc.retainAutoreleasedReturnValue(ptr %tmp) nounwind - br i1 undef, label %bb7, label %bb7 + br i1 %arg, label %bb7, label %bb7 bb7: ; preds = %bb6, %bb6, %bb5 %tmp8 = phi ptr [ %tmp, %bb ], [ %tmp, %bb ] diff --git a/llvm/test/Transforms/ObjCARC/empty-block.ll b/llvm/test/Transforms/ObjCARC/empty-block.ll index 80930812fc7e9..be70beeeb731d 100644 --- a/llvm/test/Transforms/ObjCARC/empty-block.ll +++ b/llvm/test/Transforms/ObjCARC/empty-block.ll @@ -18,9 +18,9 @@ declare ptr @llvm.objc.autoreleaseReturnValue(ptr) ; CHECK: @llvm.objc.autoreleaseReturnValue ; CHECK-NOT: @llvm.objc. 
; CHECK: } -define ptr @test0(ptr %buffer) nounwind { +define ptr @test0(ptr %buffer, i1 %arg) nounwind { %1 = tail call ptr @llvm.objc.retain(ptr %buffer) nounwind - br i1 undef, label %.lr.ph, label %._crit_edge + br i1 %arg, label %.lr.ph, label %._crit_edge .lr.ph: ; preds = %.lr.ph, %0 br i1 false, label %.lr.ph, label %._crit_edge @@ -37,10 +37,10 @@ define ptr @test0(ptr %buffer) nounwind { ; CHECK-LABEL: define ptr @test1( ; CHECK-NOT: @objc ; CHECK: } -define ptr @test1() nounwind { +define ptr @test1(i1 %arg) nounwind { %buffer = call ptr @foo() %1 = tail call ptr @llvm.objc.retain(ptr %buffer) nounwind - br i1 undef, label %.lr.ph, label %._crit_edge + br i1 %arg, label %.lr.ph, label %._crit_edge .lr.ph: ; preds = %.lr.ph, %0 br i1 false, label %.lr.ph, label %._crit_edge diff --git a/llvm/test/Transforms/ObjCARC/path-overflow.ll b/llvm/test/Transforms/ObjCARC/path-overflow.ll index eeef70554d114..431b377481262 100644 --- a/llvm/test/Transforms/ObjCARC/path-overflow.ll +++ b/llvm/test/Transforms/ObjCARC/path-overflow.ll @@ -29,9 +29,9 @@ declare i32 @__gxx_personality_sj0(...) declare i32 @__objc_personality_v0(...) -define hidden void @test1() personality ptr @__gxx_personality_sj0 { +define hidden void @test1(i1 %arg) personality ptr @__gxx_personality_sj0 { entry: - br i1 undef, label %msgSend.nullinit, label %msgSend.call + br i1 %arg, label %msgSend.nullinit, label %msgSend.call msgSend.call: ; preds = %entry br label %msgSend.cont @@ -41,7 +41,7 @@ msgSend.nullinit: ; preds = %entry msgSend.cont: ; preds = %msgSend.nullinit, %msgSend.call %0 = call ptr @llvm.objc.retain(ptr @_unnamed_cfstring) nounwind - br i1 undef, label %msgSend.nullinit33, label %msgSend.call32 + br i1 %arg, label %msgSend.nullinit33, label %msgSend.call32 msgSend.call32: ; preds = %if.end10 br label %msgSend.cont34 @@ -50,7 +50,7 @@ msgSend.nullinit33: ; preds = %if.end10 br label %msgSend.cont34 msgSend.cont34: ; preds = %msgSend.nullinit33, %msgSend.call32 - br i1 undef, label %msgSend.nullinit38, label %msgSend.call37 + br i1 %arg, label %msgSend.nullinit38, label %msgSend.call37 msgSend.call37: ; preds = %msgSend.cont34 br label %msgSend.cont39 @@ -59,7 +59,7 @@ msgSend.nullinit38: ; preds = %msgSend.cont34 br label %msgSend.cont39 msgSend.cont39: ; preds = %msgSend.nullinit38, %msgSend.call37 - br i1 undef, label %msgSend.nullinit49, label %msgSend.call48 + br i1 %arg, label %msgSend.nullinit49, label %msgSend.call48 msgSend.call48: ; preds = %msgSend.cont39 br label %msgSend.cont50 @@ -68,7 +68,7 @@ msgSend.nullinit49: ; preds = %msgSend.cont39 br label %msgSend.cont50 msgSend.cont50: ; preds = %msgSend.nullinit49, %msgSend.call48 - br i1 undef, label %msgSend.nullinit61, label %msgSend.call60 + br i1 %arg, label %msgSend.nullinit61, label %msgSend.call60 msgSend.call60: ; preds = %msgSend.cont50 br label %msgSend.cont62 @@ -77,7 +77,7 @@ msgSend.nullinit61: ; preds = %msgSend.cont50 br label %msgSend.cont62 msgSend.cont62: ; preds = %msgSend.nullinit61, %msgSend.call60 - br i1 undef, label %msgSend.nullinit67, label %msgSend.call66 + br i1 %arg, label %msgSend.nullinit67, label %msgSend.call66 msgSend.call66: ; preds = %msgSend.cont62 br label %msgSend.cont68 @@ -86,7 +86,7 @@ msgSend.nullinit67: ; preds = %msgSend.cont62 br label %msgSend.cont68 msgSend.cont68: ; preds = %msgSend.nullinit67, %msgSend.call66 - br i1 undef, label %msgSend.nullinit84, label %msgSend.call83 + br i1 %arg, label %msgSend.nullinit84, label %msgSend.call83 msgSend.call83: ; preds = %msgSend.cont68 br label 
%msgSend.cont85 @@ -95,7 +95,7 @@ msgSend.nullinit84: ; preds = %msgSend.cont68 br label %msgSend.cont85 msgSend.cont85: ; preds = %msgSend.nullinit84, %msgSend.call83 - br i1 undef, label %msgSend.nullinit90, label %msgSend.call89 + br i1 %arg, label %msgSend.nullinit90, label %msgSend.call89 msgSend.call89: ; preds = %msgSend.cont85 br label %msgSend.cont91 @@ -104,7 +104,7 @@ msgSend.nullinit90: ; preds = %msgSend.cont85 br label %msgSend.cont91 msgSend.cont91: ; preds = %msgSend.nullinit90, %msgSend.call89 - br i1 undef, label %msgSend.nullinit104, label %msgSend.call103 + br i1 %arg, label %msgSend.nullinit104, label %msgSend.call103 msgSend.call103: ; preds = %msgSend.cont91 br label %msgSend.cont105 @@ -113,16 +113,16 @@ msgSend.nullinit104: ; preds = %msgSend.cont91 br label %msgSend.cont105 msgSend.cont105: ; preds = %msgSend.nullinit104, %msgSend.call103 - br i1 undef, label %land.lhs.true, label %if.end123 + br i1 %arg, label %land.lhs.true, label %if.end123 land.lhs.true: ; preds = %msgSend.cont105 - br i1 undef, label %if.then117, label %if.end123 + br i1 %arg, label %if.then117, label %if.end123 if.then117: ; preds = %land.lhs.true br label %if.end123 if.end123: ; preds = %if.then117, %land.lhs.true, %msgSend.cont105 - br i1 undef, label %msgSend.nullinit132, label %msgSend.call131 + br i1 %arg, label %msgSend.nullinit132, label %msgSend.call131 msgSend.call131: ; preds = %if.end123 br label %msgSend.cont133 @@ -131,7 +131,7 @@ msgSend.nullinit132: ; preds = %if.end123 br label %msgSend.cont133 msgSend.cont133: ; preds = %msgSend.nullinit132, %msgSend.call131 - br i1 undef, label %msgSend.nullinit139, label %msgSend.call138 + br i1 %arg, label %msgSend.nullinit139, label %msgSend.call138 msgSend.call138: ; preds = %msgSend.cont133 br label %msgSend.cont140 @@ -140,13 +140,13 @@ msgSend.nullinit139: ; preds = %msgSend.cont133 br label %msgSend.cont140 msgSend.cont140: ; preds = %msgSend.nullinit139, %msgSend.call138 - br i1 undef, label %if.then151, label %if.end157 + br i1 %arg, label %if.then151, label %if.end157 if.then151: ; preds = %msgSend.cont140 br label %if.end157 if.end157: ; preds = %if.then151, %msgSend.cont140 - br i1 undef, label %msgSend.nullinit164, label %msgSend.call163 + br i1 %arg, label %msgSend.nullinit164, label %msgSend.call163 msgSend.call163: ; preds = %if.end157 br label %msgSend.cont165 @@ -155,7 +155,7 @@ msgSend.nullinit164: ; preds = %if.end157 br label %msgSend.cont165 msgSend.cont165: ; preds = %msgSend.nullinit164, %msgSend.call163 - br i1 undef, label %msgSend.nullinit176, label %msgSend.call175 + br i1 %arg, label %msgSend.nullinit176, label %msgSend.call175 msgSend.call175: ; preds = %msgSend.cont165 br label %msgSend.cont177 @@ -164,13 +164,13 @@ msgSend.nullinit176: ; preds = %msgSend.cont165 br label %msgSend.cont177 msgSend.cont177: ; preds = %msgSend.nullinit176, %msgSend.call175 - br i1 undef, label %land.lhs.true181, label %if.end202 + br i1 %arg, label %land.lhs.true181, label %if.end202 land.lhs.true181: ; preds = %msgSend.cont177 - br i1 undef, label %if.then187, label %if.end202 + br i1 %arg, label %if.then187, label %if.end202 if.then187: ; preds = %land.lhs.true181 - br i1 undef, label %msgSend.nullinit199, label %msgSend.call198 + br i1 %arg, label %msgSend.nullinit199, label %msgSend.call198 msgSend.call198: ; preds = %if.then187 br label %msgSend.cont200 @@ -182,7 +182,7 @@ msgSend.cont200: ; preds = %msgSend.nullinit199 br label %if.end202 if.end202: ; preds = %msgSend.cont200, %land.lhs.true181, %msgSend.cont177 - br i1 
undef, label %msgSend.nullinit236, label %msgSend.call235 + br i1 %arg, label %msgSend.nullinit236, label %msgSend.call235 msgSend.call235: ; preds = %if.end202 br label %msgSend.cont237 @@ -191,7 +191,7 @@ msgSend.nullinit236: ; preds = %if.end202 br label %msgSend.cont237 msgSend.cont237: ; preds = %msgSend.nullinit236, %msgSend.call235 - br i1 undef, label %msgSend.nullinit254, label %msgSend.call253 + br i1 %arg, label %msgSend.nullinit254, label %msgSend.call253 msgSend.call253: ; preds = %msgSend.cont237 br label %msgSend.cont255 @@ -200,7 +200,7 @@ msgSend.nullinit254: ; preds = %msgSend.cont237 br label %msgSend.cont255 msgSend.cont255: ; preds = %msgSend.nullinit254, %msgSend.call253 - br i1 undef, label %msgSend.nullinit269, label %msgSend.call268 + br i1 %arg, label %msgSend.nullinit269, label %msgSend.call268 msgSend.call268: ; preds = %msgSend.cont255 br label %msgSend.cont270 @@ -209,7 +209,7 @@ msgSend.nullinit269: ; preds = %msgSend.cont255 br label %msgSend.cont270 msgSend.cont270: ; preds = %msgSend.nullinit269, %msgSend.call268 - br i1 undef, label %msgSend.nullinit281, label %msgSend.call280 + br i1 %arg, label %msgSend.nullinit281, label %msgSend.call280 msgSend.call280: ; preds = %msgSend.cont270 br label %msgSend.cont282 @@ -218,7 +218,7 @@ msgSend.nullinit281: ; preds = %msgSend.cont270 br label %msgSend.cont282 msgSend.cont282: ; preds = %msgSend.nullinit281, %msgSend.call280 - br i1 undef, label %msgSend.nullinit287, label %msgSend.call286 + br i1 %arg, label %msgSend.nullinit287, label %msgSend.call286 msgSend.call286: ; preds = %msgSend.cont282 br label %msgSend.cont288 @@ -227,7 +227,7 @@ msgSend.nullinit287: ; preds = %msgSend.cont282 br label %msgSend.cont288 msgSend.cont288: ; preds = %msgSend.nullinit287, %msgSend.call286 - br i1 undef, label %msgSend.nullinit303, label %msgSend.call302 + br i1 %arg, label %msgSend.nullinit303, label %msgSend.call302 msgSend.call302: ; preds = %msgSend.cont288 br label %msgSend.cont304 @@ -236,7 +236,7 @@ msgSend.nullinit303: ; preds = %msgSend.cont288 br label %msgSend.cont304 msgSend.cont304: ; preds = %msgSend.nullinit303, %msgSend.call302 - br i1 undef, label %msgSend.nullinit344, label %msgSend.call343 + br i1 %arg, label %msgSend.nullinit344, label %msgSend.call343 msgSend.call343: ; preds = %msgSend.cont304 br label %msgSend.cont345 @@ -245,7 +245,7 @@ msgSend.nullinit344: ; preds = %msgSend.cont304 br label %msgSend.cont345 msgSend.cont345: ; preds = %msgSend.nullinit344, %msgSend.call343 - br i1 undef, label %msgSend.nullinit350, label %msgSend.call349 + br i1 %arg, label %msgSend.nullinit350, label %msgSend.call349 msgSend.call349: ; preds = %msgSend.cont345 br label %msgSend.cont351 @@ -254,7 +254,7 @@ msgSend.nullinit350: ; preds = %msgSend.cont345 br label %msgSend.cont351 msgSend.cont351: ; preds = %msgSend.nullinit350, %msgSend.call349 - br i1 undef, label %msgSend.nullinit366, label %msgSend.call365 + br i1 %arg, label %msgSend.nullinit366, label %msgSend.call365 msgSend.call365: ; preds = %msgSend.cont351 br label %msgSend.cont367 @@ -263,7 +263,7 @@ msgSend.nullinit366: ; preds = %msgSend.cont351 br label %msgSend.cont367 msgSend.cont367: ; preds = %msgSend.nullinit366, %msgSend.call365 - br i1 undef, label %msgSend.nullinit376, label %msgSend.call375 + br i1 %arg, label %msgSend.nullinit376, label %msgSend.call375 msgSend.call375: ; preds = %msgSend.cont367 br label %msgSend.cont377 @@ -272,10 +272,10 @@ msgSend.nullinit376: ; preds = %msgSend.cont367 br label %msgSend.cont377 msgSend.cont377: ; preds = 
%msgSend.nullinit376, %msgSend.call375 - br i1 undef, label %if.then384, label %if.else401 + br i1 %arg, label %if.then384, label %if.else401 if.then384: ; preds = %msgSend.cont377 - br i1 undef, label %msgSend.nullinit392, label %msgSend.call391 + br i1 %arg, label %msgSend.nullinit392, label %msgSend.call391 msgSend.call391: ; preds = %if.then384 br label %msgSend.cont393 @@ -287,7 +287,7 @@ msgSend.cont393: ; preds = %msgSend.nullinit392 br label %if.end418 if.else401: ; preds = %msgSend.cont377 - br i1 undef, label %msgSend.nullinit409, label %msgSend.call408 + br i1 %arg, label %msgSend.nullinit409, label %msgSend.call408 msgSend.call408: ; preds = %if.else401 br label %msgSend.cont410 @@ -299,7 +299,7 @@ msgSend.cont410: ; preds = %msgSend.nullinit409 br label %if.end418 if.end418: ; preds = %msgSend.cont410, %msgSend.cont393 - br i1 undef, label %msgSend.nullinit470, label %msgSend.call469 + br i1 %arg, label %msgSend.nullinit470, label %msgSend.call469 msgSend.call469: ; preds = %if.end418 br label %msgSend.cont471 @@ -308,7 +308,7 @@ msgSend.nullinit470: ; preds = %if.end418 br label %msgSend.cont471 msgSend.cont471: ; preds = %msgSend.nullinit470, %msgSend.call469 - br i1 undef, label %msgSend.nullinit484, label %msgSend.call483 + br i1 %arg, label %msgSend.nullinit484, label %msgSend.call483 msgSend.call483: ; preds = %msgSend.cont471 br label %msgSend.cont485 @@ -317,7 +317,7 @@ msgSend.nullinit484: ; preds = %msgSend.cont471 br label %msgSend.cont485 msgSend.cont485: ; preds = %msgSend.nullinit484, %msgSend.call483 - br i1 undef, label %msgSend.nullinit500, label %msgSend.call499 + br i1 %arg, label %msgSend.nullinit500, label %msgSend.call499 msgSend.call499: ; preds = %msgSend.cont485 br label %msgSend.cont501 @@ -326,7 +326,7 @@ msgSend.nullinit500: ; preds = %msgSend.cont485 br label %msgSend.cont501 msgSend.cont501: ; preds = %msgSend.nullinit500, %msgSend.call499 - br i1 undef, label %msgSend.nullinit506, label %msgSend.call505 + br i1 %arg, label %msgSend.nullinit506, label %msgSend.call505 msgSend.call505: ; preds = %msgSend.cont501 br label %msgSend.cont507 @@ -340,78 +340,78 @@ msgSend.cont507: ; preds = %msgSend.nullinit506 } ; Function Attrs: optsize ssp uwtable -define void @test2() unnamed_addr align 2 personality ptr @__gxx_personality_sj0 { +define void @test2(i1 %arg) unnamed_addr align 2 personality ptr @__gxx_personality_sj0 { bb: - br i1 undef, label %bb3, label %bb2 + br i1 %arg, label %bb3, label %bb2 bb2: ; preds = %bb br label %bb3 bb3: ; preds = %bb2, %bb - br i1 undef, label %bb5, label %bb4 + br i1 %arg, label %bb5, label %bb4 bb4: ; preds = %bb3 br label %bb5 bb5: ; preds = %bb4, %bb3 - br i1 undef, label %bb7, label %bb6 + br i1 %arg, label %bb7, label %bb6 bb6: ; preds = %bb5 br label %bb7 bb7: ; preds = %bb6, %bb5 - br i1 undef, label %bb9, label %bb8 + br i1 %arg, label %bb9, label %bb8 bb8: ; preds = %bb7 unreachable bb9: ; preds = %bb7 - br i1 undef, label %bb11, label %bb10 + br i1 %arg, label %bb11, label %bb10 bb10: ; preds = %bb9 br label %bb11 bb11: ; preds = %bb10, %bb9 - br i1 undef, label %bb13, label %bb12 + br i1 %arg, label %bb13, label %bb12 bb12: ; preds = %bb11 br label %bb13 bb13: ; preds = %bb12, %bb11 - br i1 undef, label %bb15, label %bb14 + br i1 %arg, label %bb15, label %bb14 bb14: ; preds = %bb13 br label %bb15 bb15: ; preds = %bb14, %bb13 - br i1 undef, label %bb17, label %bb16 + br i1 %arg, label %bb17, label %bb16 bb16: ; preds = %bb15 br label %bb17 bb17: ; preds = %bb16, %bb15 - br i1 undef, label %bb19, label %bb18 + 
br i1 %arg, label %bb19, label %bb18 bb18: ; preds = %bb17 br label %bb19 bb19: ; preds = %bb18, %bb17 - br i1 undef, label %bb222, label %bb20 + br i1 %arg, label %bb222, label %bb20 bb20: ; preds = %bb19 - br i1 undef, label %bb222, label %bb21 + br i1 %arg, label %bb222, label %bb21 bb21: ; preds = %bb20 - br i1 undef, label %bb22, label %bb30 + br i1 %arg, label %bb22, label %bb30 bb22: ; preds = %bb21 - br i1 undef, label %bb23, label %bb32 + br i1 %arg, label %bb23, label %bb32 bb23: ; preds = %bb22 - br i1 undef, label %bb24, label %bb34 + br i1 %arg, label %bb24, label %bb34 bb24: ; preds = %bb23 - br i1 undef, label %bb26, label %bb25 + br i1 %arg, label %bb26, label %bb25 bb25: ; preds = %bb24 br label %bb27 @@ -420,10 +420,10 @@ bb26: ; preds = %bb24 br label %bb27 bb27: ; preds = %bb26, %bb25 - br i1 undef, label %bb28, label %bb42 + br i1 %arg, label %bb28, label %bb42 bb28: ; preds = %bb27 - br i1 undef, label %bb36, label %bb29 + br i1 %arg, label %bb36, label %bb29 bb29: ; preds = %bb28 br label %bb36 @@ -438,7 +438,7 @@ bb34: ; preds = %bb23 unreachable bb36: ; preds = %bb29, %bb28 - br i1 undef, label %bb38, label %bb37 + br i1 %arg, label %bb38, label %bb37 bb37: ; preds = %bb36 br label %bb39 @@ -447,7 +447,7 @@ bb38: ; preds = %bb36 br label %bb39 bb39: ; preds = %bb38, %bb37 - br i1 undef, label %bb41, label %bb40 + br i1 %arg, label %bb41, label %bb40 bb40: ; preds = %bb39 unreachable @@ -456,19 +456,19 @@ bb41: ; preds = %bb39 br label %bb42 bb42: ; preds = %bb41, %bb27 - br i1 undef, label %bb43, label %bb214 + br i1 %arg, label %bb43, label %bb214 bb43: ; preds = %bb42 - br i1 undef, label %bb47, label %bb45 + br i1 %arg, label %bb47, label %bb45 bb45: ; preds = %bb130, %bb128, %bb126, %bb124, %bb122, %bb120, %bb118, %bb116, %bb114, %bb112, %bb110, %bb108, %bb105, %bb102, %bb100, %bb96, %bb94, %bb90, %bb88, %bb84, %bb82, %bb78, %bb76, %bb72, %bb70, %bb66, %bb64, %bb60, %bb58, %bb54, %bb51, %bb43 unreachable bb47: ; preds = %bb43 - br i1 undef, label %bb48, label %bb106 + br i1 %arg, label %bb48, label %bb106 bb48: ; preds = %bb47 - br i1 undef, label %bb50, label %bb49 + br i1 %arg, label %bb50, label %bb49 bb49: ; preds = %bb48 br label %bb51 @@ -477,16 +477,16 @@ bb50: ; preds = %bb48 br label %bb51 bb51: ; preds = %bb50, %bb49 - br i1 undef, label %bb53, label %bb45 + br i1 %arg, label %bb53, label %bb45 bb53: ; preds = %bb51 - br i1 undef, label %bb54, label %bb134 + br i1 %arg, label %bb54, label %bb134 bb54: ; preds = %bb53 - br i1 undef, label %bb55, label %bb45 + br i1 %arg, label %bb55, label %bb45 bb55: ; preds = %bb54 - br i1 undef, label %bb57, label %bb56 + br i1 %arg, label %bb57, label %bb56 bb56: ; preds = %bb55 br label %bb58 @@ -495,13 +495,13 @@ bb57: ; preds = %bb55 br label %bb58 bb58: ; preds = %bb57, %bb56 - br i1 undef, label %bb60, label %bb45 + br i1 %arg, label %bb60, label %bb45 bb60: ; preds = %bb58 - br i1 undef, label %bb61, label %bb45 + br i1 %arg, label %bb61, label %bb45 bb61: ; preds = %bb60 - br i1 undef, label %bb63, label %bb62 + br i1 %arg, label %bb63, label %bb62 bb62: ; preds = %bb61 br label %bb64 @@ -510,13 +510,13 @@ bb63: ; preds = %bb61 br label %bb64 bb64: ; preds = %bb63, %bb62 - br i1 undef, label %bb66, label %bb45 + br i1 %arg, label %bb66, label %bb45 bb66: ; preds = %bb64 - br i1 undef, label %bb67, label %bb45 + br i1 %arg, label %bb67, label %bb45 bb67: ; preds = %bb66 - br i1 undef, label %bb69, label %bb68 + br i1 %arg, label %bb69, label %bb68 bb68: ; preds = %bb67 br label %bb70 @@ -525,13 +525,13 @@ 
bb69: ; preds = %bb67 br label %bb70 bb70: ; preds = %bb69, %bb68 - br i1 undef, label %bb72, label %bb45 + br i1 %arg, label %bb72, label %bb45 bb72: ; preds = %bb70 - br i1 undef, label %bb73, label %bb45 + br i1 %arg, label %bb73, label %bb45 bb73: ; preds = %bb72 - br i1 undef, label %bb75, label %bb74 + br i1 %arg, label %bb75, label %bb74 bb74: ; preds = %bb73 br label %bb76 @@ -540,13 +540,13 @@ bb75: ; preds = %bb73 br label %bb76 bb76: ; preds = %bb75, %bb74 - br i1 undef, label %bb78, label %bb45 + br i1 %arg, label %bb78, label %bb45 bb78: ; preds = %bb76 - br i1 undef, label %bb79, label %bb45 + br i1 %arg, label %bb79, label %bb45 bb79: ; preds = %bb78 - br i1 undef, label %bb81, label %bb80 + br i1 %arg, label %bb81, label %bb80 bb80: ; preds = %bb79 br label %bb82 @@ -555,13 +555,13 @@ bb81: ; preds = %bb79 br label %bb82 bb82: ; preds = %bb81, %bb80 - br i1 undef, label %bb84, label %bb45 + br i1 %arg, label %bb84, label %bb45 bb84: ; preds = %bb82 - br i1 undef, label %bb85, label %bb45 + br i1 %arg, label %bb85, label %bb45 bb85: ; preds = %bb84 - br i1 undef, label %bb87, label %bb86 + br i1 %arg, label %bb87, label %bb86 bb86: ; preds = %bb85 br label %bb88 @@ -570,13 +570,13 @@ bb87: ; preds = %bb85 br label %bb88 bb88: ; preds = %bb87, %bb86 - br i1 undef, label %bb90, label %bb45 + br i1 %arg, label %bb90, label %bb45 bb90: ; preds = %bb88 - br i1 undef, label %bb91, label %bb45 + br i1 %arg, label %bb91, label %bb45 bb91: ; preds = %bb90 - br i1 undef, label %bb93, label %bb92 + br i1 %arg, label %bb93, label %bb92 bb92: ; preds = %bb91 br label %bb94 @@ -585,13 +585,13 @@ bb93: ; preds = %bb91 br label %bb94 bb94: ; preds = %bb93, %bb92 - br i1 undef, label %bb96, label %bb45 + br i1 %arg, label %bb96, label %bb45 bb96: ; preds = %bb94 - br i1 undef, label %bb97, label %bb45 + br i1 %arg, label %bb97, label %bb45 bb97: ; preds = %bb96 - br i1 undef, label %bb99, label %bb98 + br i1 %arg, label %bb99, label %bb98 bb98: ; preds = %bb97 br label %bb100 @@ -600,82 +600,82 @@ bb99: ; preds = %bb97 br label %bb100 bb100: ; preds = %bb99, %bb98 - br i1 undef, label %bb102, label %bb45 + br i1 %arg, label %bb102, label %bb45 bb102: ; preds = %bb100 - br i1 undef, label %bb104, label %bb45 + br i1 %arg, label %bb104, label %bb45 bb104: ; preds = %bb102 - br i1 undef, label %bb108, label %bb105 + br i1 %arg, label %bb108, label %bb105 bb105: ; preds = %bb104 - br i1 undef, label %bb108, label %bb45 + br i1 %arg, label %bb108, label %bb45 bb106: ; preds = %bb47 unreachable bb108: ; preds = %bb105, %bb104 - br i1 undef, label %bb110, label %bb45 + br i1 %arg, label %bb110, label %bb45 bb110: ; preds = %bb108 - br i1 undef, label %bb112, label %bb45 + br i1 %arg, label %bb112, label %bb45 bb112: ; preds = %bb110 - br i1 undef, label %bb114, label %bb45 + br i1 %arg, label %bb114, label %bb45 bb114: ; preds = %bb112 - br i1 undef, label %bb116, label %bb45 + br i1 %arg, label %bb116, label %bb45 bb116: ; preds = %bb114 - br i1 undef, label %bb118, label %bb45 + br i1 %arg, label %bb118, label %bb45 bb118: ; preds = %bb116 - br i1 undef, label %bb120, label %bb45 + br i1 %arg, label %bb120, label %bb45 bb120: ; preds = %bb118 - br i1 undef, label %bb122, label %bb45 + br i1 %arg, label %bb122, label %bb45 bb122: ; preds = %bb120 - br i1 undef, label %bb124, label %bb45 + br i1 %arg, label %bb124, label %bb45 bb124: ; preds = %bb122 - br i1 undef, label %bb126, label %bb45 + br i1 %arg, label %bb126, label %bb45 bb126: ; preds = %bb124 - br i1 undef, label %bb128, label %bb45 + br 
i1 %arg, label %bb128, label %bb45 bb128: ; preds = %bb126 - br i1 undef, label %bb130, label %bb45 + br i1 %arg, label %bb130, label %bb45 bb130: ; preds = %bb128 - br i1 undef, label %bb132, label %bb45 + br i1 %arg, label %bb132, label %bb45 bb132: ; preds = %bb130 - br i1 undef, label %bb135, label %bb30 + br i1 %arg, label %bb135, label %bb30 bb134: ; preds = %bb53 unreachable bb135: ; preds = %bb132 - br i1 undef, label %bb139, label %bb136 + br i1 %arg, label %bb139, label %bb136 bb136: ; preds = %bb135 - br i1 undef, label %bb138, label %bb30 + br i1 %arg, label %bb138, label %bb30 bb138: ; preds = %bb136 br label %bb139 bb139: ; preds = %bb138, %bb135 - br i1 undef, label %bb140, label %bb141 + br i1 %arg, label %bb140, label %bb141 bb140: ; preds = %bb139 unreachable bb141: ; preds = %bb139 - br i1 undef, label %bb142, label %bb215 + br i1 %arg, label %bb142, label %bb215 bb142: ; preds = %bb141 - br i1 undef, label %bb144, label %bb143 + br i1 %arg, label %bb144, label %bb143 bb143: ; preds = %bb142 br label %bb145 @@ -684,16 +684,16 @@ bb144: ; preds = %bb142 br label %bb145 bb145: ; preds = %bb144, %bb143 - br i1 undef, label %bb146, label %bb151 + br i1 %arg, label %bb146, label %bb151 bb146: ; preds = %bb145 - br i1 undef, label %bb148, label %bb153 + br i1 %arg, label %bb148, label %bb153 bb148: ; preds = %bb146 - br i1 undef, label %bb155, label %bb149 + br i1 %arg, label %bb155, label %bb149 bb149: ; preds = %bb148 - br i1 undef, label %bb150, label %bb153 + br i1 %arg, label %bb150, label %bb153 bb150: ; preds = %bb149 br label %bb155 @@ -705,7 +705,7 @@ bb153: ; preds = %bb158, %bb149, %bb1 unreachable bb155: ; preds = %bb150, %bb148 - br i1 undef, label %bb157, label %bb156 + br i1 %arg, label %bb157, label %bb156 bb156: ; preds = %bb155 br label %bb158 @@ -714,10 +714,10 @@ bb157: ; preds = %bb155 br label %bb158 bb158: ; preds = %bb157, %bb156 - br i1 undef, label %bb160, label %bb153 + br i1 %arg, label %bb160, label %bb153 bb160: ; preds = %bb158 - br i1 undef, label %bb162, label %bb161 + br i1 %arg, label %bb162, label %bb161 bb161: ; preds = %bb160 br label %bb163 @@ -726,16 +726,16 @@ bb162: ; preds = %bb160 br label %bb163 bb163: ; preds = %bb162, %bb161 - br i1 undef, label %bb165, label %bb164 + br i1 %arg, label %bb165, label %bb164 bb164: ; preds = %bb163 br label %bb165 bb165: ; preds = %bb164, %bb163 - br i1 undef, label %bb170, label %bb166 + br i1 %arg, label %bb170, label %bb166 bb166: ; preds = %bb165 - br i1 undef, label %bb167, label %bb168 + br i1 %arg, label %bb167, label %bb168 bb167: ; preds = %bb166 unreachable @@ -744,25 +744,25 @@ bb168: ; preds = %bb166 unreachable bb170: ; preds = %bb165 - br i1 undef, label %bb215, label %bb171 + br i1 %arg, label %bb215, label %bb171 bb171: ; preds = %bb170 - br i1 undef, label %bb173, label %bb30 + br i1 %arg, label %bb173, label %bb30 bb173: ; preds = %bb171 - br i1 undef, label %bb174, label %bb215 + br i1 %arg, label %bb174, label %bb215 bb174: ; preds = %bb173 - br i1 undef, label %bb176, label %bb30 + br i1 %arg, label %bb176, label %bb30 bb176: ; preds = %bb174 - br i1 undef, label %bb178, label %bb30 + br i1 %arg, label %bb178, label %bb30 bb178: ; preds = %bb176 - br i1 undef, label %bb179, label %bb193 + br i1 %arg, label %bb179, label %bb193 bb179: ; preds = %bb178 - br i1 undef, label %bb181, label %bb180 + br i1 %arg, label %bb181, label %bb180 bb180: ; preds = %bb179 br label %bb182 @@ -771,23 +771,23 @@ bb181: ; preds = %bb179 br label %bb182 bb182: ; preds = %bb181, %bb180 - br i1 undef, 
label %bb184, label %bb30 + br i1 %arg, label %bb184, label %bb30 bb184: ; preds = %bb182 %tmp185 = call ptr @returner() - br i1 undef, label %bb186, label %bb195 + br i1 %arg, label %bb186, label %bb195 bb186: ; preds = %bb184 %tmp188 = call ptr @llvm.objc.retainAutoreleasedReturnValue(ptr %tmp185) %tmp189 = call ptr @llvm.objc.retain(ptr %tmp188) call void @llvm.objc.release(ptr %tmp189), !clang.imprecise_release !0 - br i1 undef, label %bb197, label %bb190 + br i1 %arg, label %bb197, label %bb190 bb190: ; preds = %bb186 - br i1 undef, label %bb192, label %bb195 + br i1 %arg, label %bb192, label %bb195 bb192: ; preds = %bb190 - br i1 undef, label %bb197, label %bb195 + br i1 %arg, label %bb197, label %bb195 bb193: ; preds = %bb178 br label %bb213 @@ -796,37 +796,37 @@ bb195: ; preds = %bb192, %bb190, %bb1 unreachable bb197: ; preds = %bb192, %bb186 - br i1 undef, label %bb198, label %bb215 + br i1 %arg, label %bb198, label %bb215 bb198: ; preds = %bb197 - br i1 undef, label %bb202, label %bb199 + br i1 %arg, label %bb202, label %bb199 bb199: ; preds = %bb198 - br i1 undef, label %bb201, label %bb30 + br i1 %arg, label %bb201, label %bb30 bb201: ; preds = %bb199 br label %bb202 bb202: ; preds = %bb201, %bb198 - br i1 undef, label %bb206, label %bb203 + br i1 %arg, label %bb206, label %bb203 bb203: ; preds = %bb202 - br i1 undef, label %bb205, label %bb30 + br i1 %arg, label %bb205, label %bb30 bb205: ; preds = %bb203 br label %bb206 bb206: ; preds = %bb205, %bb202 - br i1 undef, label %bb210, label %bb207 + br i1 %arg, label %bb210, label %bb207 bb207: ; preds = %bb206 - br i1 undef, label %bb209, label %bb30 + br i1 %arg, label %bb209, label %bb30 bb209: ; preds = %bb207 br label %bb210 bb210: ; preds = %bb209, %bb206 - br i1 undef, label %bb212, label %bb30 + br i1 %arg, label %bb212, label %bb30 bb212: ; preds = %bb210 unreachable @@ -838,19 +838,19 @@ bb214: ; preds = %bb42 br label %bb219 bb215: ; preds = %bb197, %bb173, %bb170, %bb141 - br i1 undef, label %bb217, label %bb216 + br i1 %arg, label %bb217, label %bb216 bb216: ; preds = %bb215 br label %bb217 bb217: ; preds = %bb216, %bb215 - br i1 undef, label %bb219, label %bb218 + br i1 %arg, label %bb219, label %bb218 bb218: ; preds = %bb217 br label %bb219 bb219: ; preds = %bb218, %bb217, %bb214 - br i1 undef, label %bb221, label %bb220 + br i1 %arg, label %bb221, label %bb220 bb220: ; preds = %bb219 unreachable @@ -863,7 +863,7 @@ bb222: ; preds = %bb20, %bb19 } ; Function Attrs: ssp -define void @test3() #1 personality ptr @__gxx_personality_sj0 { +define void @test3(i1 %arg) #1 personality ptr @__gxx_personality_sj0 { entry: %call2 = invoke ptr @objc_msgSend(ptr undef, ptr undef, ptr @_unnamed_cfstring) to label %invoke.cont unwind label %lpad @@ -873,7 +873,7 @@ invoke.cont: ; preds = %entry to label %invoke.cont4 unwind label %lpad3 invoke.cont4: ; preds = %invoke.cont - br i1 undef, label %land.end, label %land.rhs + br i1 %arg, label %land.end, label %land.rhs land.rhs: ; preds = %invoke.cont4 %call7 = invoke i32 @objc_msgSend(ptr undef, ptr undef) @@ -884,7 +884,7 @@ land.end: ; preds = %land.rhs, %invoke.c to label %invoke.cont.i unwind label %lpad.i invoke.cont.i: ; preds = %land.end - br i1 undef, label %invoke.cont8, label %if.then.i + br i1 %arg, label %invoke.cont8, label %if.then.i if.then.i: ; preds = %invoke.cont.i br label %invoke.cont8 @@ -907,7 +907,7 @@ invoke.cont21: ; preds = %invoke.cont17 to label %invoke.cont.i1980 unwind label %lpad.i1982 invoke.cont.i1980: ; preds = %invoke.cont21 - br i1 undef, label 
%invoke.cont24, label %if.then.i1981 + br i1 %arg, label %invoke.cont24, label %if.then.i1981 if.then.i1981: ; preds = %invoke.cont.i1980 br label %invoke.cont24 @@ -922,7 +922,7 @@ invoke.cont24: ; preds = %if.then.i1981, %inv to label %invoke.cont36 unwind label %lpad35 invoke.cont36: ; preds = %invoke.cont24 - br i1 undef, label %land.end43, label %land.rhs39 + br i1 %arg, label %land.end43, label %land.rhs39 land.rhs39: ; preds = %invoke.cont36 %call41 = invoke signext i8 @objc_msgSend(ptr undef, ptr undef, ptr @_unnamed_cfstring) @@ -933,7 +933,7 @@ land.end43: ; preds = %land.rhs39, %invoke to label %invoke.cont.i1986 unwind label %lpad.i1988 invoke.cont.i1986: ; preds = %land.end43 - br i1 undef, label %invoke.cont44, label %if.then.i1987 + br i1 %arg, label %invoke.cont44, label %if.then.i1987 if.then.i1987: ; preds = %invoke.cont.i1986 br label %invoke.cont44 @@ -948,7 +948,7 @@ invoke.cont44: ; preds = %if.then.i1987, %inv to label %invoke.cont52 unwind label %lpad51 invoke.cont52: ; preds = %invoke.cont44 - br i1 undef, label %land.end70, label %land.rhs58 + br i1 %arg, label %land.end70, label %land.rhs58 land.rhs58: ; preds = %invoke.cont52 %call63 = invoke ptr @objc_msgSend(ptr undef, ptr undef, i32 42) @@ -963,7 +963,7 @@ land.end70: ; preds = %invoke.cont62, %inv to label %invoke.cont.i1992 unwind label %lpad66.body invoke.cont.i1992: ; preds = %land.end70 - br i1 undef, label %invoke.cont71, label %if.then.i1993 + br i1 %arg, label %invoke.cont71, label %if.then.i1993 if.then.i1993: ; preds = %invoke.cont.i1992 br label %invoke.cont71 @@ -973,7 +973,7 @@ invoke.cont71: ; preds = %if.then.i1993, %inv to label %invoke.cont.i1998 unwind label %lpad.i2000 invoke.cont.i1998: ; preds = %invoke.cont71 - br i1 undef, label %invoke.cont91, label %if.then.i1999 + br i1 %arg, label %invoke.cont91, label %if.then.i1999 if.then.i1999: ; preds = %invoke.cont.i1998 br label %invoke.cont91 @@ -996,7 +996,7 @@ invoke.cont97: ; preds = %invoke.cont95 to label %invoke.cont.i2004 unwind label %lpad.i2006 invoke.cont.i2004: ; preds = %invoke.cont97 - br i1 undef, label %invoke.cont100, label %if.then.i2005 + br i1 %arg, label %invoke.cont100, label %if.then.i2005 if.then.i2005: ; preds = %invoke.cont.i2004 br label %invoke.cont100 @@ -1015,7 +1015,7 @@ invoke.cont110: ; preds = %invoke.cont100 to label %invoke.cont.i2010 unwind label %lpad.i2012 invoke.cont.i2010: ; preds = %invoke.cont110 - br i1 undef, label %invoke.cont117, label %if.then.i2011 + br i1 %arg, label %invoke.cont117, label %if.then.i2011 if.then.i2011: ; preds = %invoke.cont.i2010 br label %invoke.cont117 @@ -1094,7 +1094,7 @@ lpad109: ; preds = %invoke.cont100 unreachable invoke.cont.i2022: ; preds = %invoke.cont117 - br i1 undef, label %invoke.cont157, label %if.then.i2023 + br i1 %arg, label %invoke.cont157, label %if.then.i2023 if.then.i2023: ; preds = %invoke.cont.i2022 br label %invoke.cont157 @@ -1104,7 +1104,7 @@ invoke.cont157: ; preds = %if.then.i2023, %inv to label %invoke.cont.i2028 unwind label %lpad164.body invoke.cont.i2028: ; preds = %invoke.cont157 - br i1 undef, label %invoke.cont165, label %if.then.i2029 + br i1 %arg, label %invoke.cont165, label %if.then.i2029 if.then.i2029: ; preds = %invoke.cont.i2028 br label %invoke.cont165 @@ -1122,7 +1122,7 @@ invoke.cont185: ; preds = %invoke.cont184 to label %invoke.cont.i2034 unwind label %lpad.i2036 invoke.cont.i2034: ; preds = %invoke.cont185 - br i1 undef, label %invoke.cont190, label %if.then.i2035 + br i1 %arg, label %invoke.cont190, label %if.then.i2035 
if.then.i2035: ; preds = %invoke.cont.i2034 br label %invoke.cont190 @@ -1149,7 +1149,7 @@ invoke.cont204: ; preds = %invoke.cont201 to label %invoke.cont.i2040 unwind label %lpad.i2042 invoke.cont.i2040: ; preds = %invoke.cont204 - br i1 undef, label %invoke.cont207, label %if.then.i2041 + br i1 %arg, label %invoke.cont207, label %if.then.i2041 if.then.i2041: ; preds = %invoke.cont.i2040 br label %invoke.cont207 @@ -1168,7 +1168,7 @@ invoke.cont208: ; preds = %invoke.cont207 to label %invoke.cont.i2046 unwind label %lpad212.body invoke.cont.i2046: ; preds = %invoke.cont208 - br i1 undef, label %invoke.cont213, label %if.then.i2047 + br i1 %arg, label %invoke.cont213, label %if.then.i2047 if.then.i2047: ; preds = %invoke.cont.i2046 br label %invoke.cont213 @@ -1186,7 +1186,7 @@ invoke.cont228: ; preds = %invoke.cont221 to label %invoke.cont.i2052 unwind label %lpad.i2054 invoke.cont.i2052: ; preds = %invoke.cont228 - br i1 undef, label %invoke.cont231, label %if.then.i2053 + br i1 %arg, label %invoke.cont231, label %if.then.i2053 if.then.i2053: ; preds = %invoke.cont.i2052 br label %invoke.cont231 @@ -1205,7 +1205,7 @@ invoke.cont232: ; preds = %invoke.cont231 to label %invoke.cont.i2058 unwind label %lpad236.body invoke.cont.i2058: ; preds = %invoke.cont232 - br i1 undef, label %invoke.cont237, label %if.then.i2059 + br i1 %arg, label %invoke.cont237, label %if.then.i2059 if.then.i2059: ; preds = %invoke.cont.i2058 br label %invoke.cont237 @@ -1251,7 +1251,7 @@ invoke.cont278: ; preds = %invoke.cont274 to label %invoke.cont.i2064 unwind label %lpad.i2066 invoke.cont.i2064: ; preds = %invoke.cont278 - br i1 undef, label %invoke.cont281, label %if.then.i2065 + br i1 %arg, label %invoke.cont281, label %if.then.i2065 if.then.i2065: ; preds = %invoke.cont.i2064 br label %invoke.cont281 @@ -1286,7 +1286,7 @@ invoke.cont315: ; preds = %invoke.cont312 to label %invoke.cont321 unwind label %lpad320 invoke.cont321: ; preds = %invoke.cont315 - br i1 undef, label %land.end344, label %land.rhs335 + br i1 %arg, label %land.end344, label %land.rhs335 land.rhs335: ; preds = %invoke.cont321 %call342 = invoke signext i8 @objc_msgSend(ptr undef, ptr undef, ptr @_unnamed_cfstring) @@ -1297,7 +1297,7 @@ land.end344: ; preds = %land.rhs335, %invok to label %invoke.cont.i2070 unwind label %lpad340.body invoke.cont.i2070: ; preds = %land.end344 - br i1 undef, label %invoke.cont345, label %if.then.i2071 + br i1 %arg, label %invoke.cont345, label %if.then.i2071 if.then.i2071: ; preds = %invoke.cont.i2070 br label %invoke.cont345 @@ -1319,7 +1319,7 @@ invoke.cont370: ; preds = %invoke.cont364 to label %invoke.cont.i2076 unwind label %lpad.i2078 invoke.cont.i2076: ; preds = %invoke.cont370 - br i1 undef, label %invoke.cont373, label %if.then.i2077 + br i1 %arg, label %invoke.cont373, label %if.then.i2077 if.then.i2077: ; preds = %invoke.cont.i2076 br label %invoke.cont373 @@ -1346,7 +1346,7 @@ invoke.cont383: ; preds = %invoke.cont382 to label %invoke.cont.i2082 unwind label %lpad.i2084 invoke.cont.i2082: ; preds = %invoke.cont383 - br i1 undef, label %invoke.cont392, label %if.then.i2083 + br i1 %arg, label %invoke.cont392, label %if.then.i2083 if.then.i2083: ; preds = %invoke.cont.i2082 br label %invoke.cont392 @@ -1377,7 +1377,7 @@ invoke.cont402: ; preds = %invoke.cont399 to label %invoke.cont.i2088 unwind label %lpad.i2090 invoke.cont.i2088: ; preds = %invoke.cont402 - br i1 undef, label %invoke.cont405, label %if.then.i2089 + br i1 %arg, label %invoke.cont405, label %if.then.i2089 if.then.i2089: ; preds = 
%invoke.cont.i2088 br label %invoke.cont405 @@ -1404,7 +1404,7 @@ invoke.cont412: ; preds = %invoke.cont409 to label %invoke.cont.i2094 unwind label %lpad.i2096 invoke.cont.i2094: ; preds = %invoke.cont412 - br i1 undef, label %invoke.cont418, label %if.then.i2095 + br i1 %arg, label %invoke.cont418, label %if.then.i2095 if.then.i2095: ; preds = %invoke.cont.i2094 br label %invoke.cont418 @@ -1435,7 +1435,7 @@ invoke.cont429: ; preds = %invoke.cont426 to label %invoke.cont.i2100 unwind label %lpad.i2102 invoke.cont.i2100: ; preds = %invoke.cont429 - br i1 undef, label %invoke.cont432, label %if.then.i2101 + br i1 %arg, label %invoke.cont432, label %if.then.i2101 if.then.i2101: ; preds = %invoke.cont.i2100 br label %invoke.cont432 @@ -1467,7 +1467,7 @@ invoke.cont443: ; preds = %invoke.cont.i2106 to label %invoke.cont.i2112 unwind label %lpad.i2114 invoke.cont.i2112: ; preds = %invoke.cont443 - br i1 undef, label %invoke.cont449, label %if.then.i2113 + br i1 %arg, label %invoke.cont449, label %if.then.i2113 if.then.i2113: ; preds = %invoke.cont.i2112 br label %invoke.cont449 @@ -1490,7 +1490,7 @@ invoke.cont455: ; preds = %invoke.cont452 to label %invoke.cont.i2118 unwind label %lpad.i2120 invoke.cont.i2118: ; preds = %invoke.cont455 - br i1 undef, label %invoke.cont458, label %if.then.i2119 + br i1 %arg, label %invoke.cont458, label %if.then.i2119 if.then.i2119: ; preds = %invoke.cont.i2118 br label %invoke.cont458 @@ -1509,7 +1509,7 @@ invoke.cont460: ; preds = %invoke.cont458 to label %invoke.cont.i2124 unwind label %lpad.i2126 invoke.cont.i2124: ; preds = %invoke.cont460 - br i1 undef, label %invoke.cont466, label %if.then.i2125 + br i1 %arg, label %invoke.cont466, label %if.then.i2125 if.then.i2125: ; preds = %invoke.cont.i2124 br label %invoke.cont466 @@ -1528,7 +1528,7 @@ invoke.cont469: ; preds = %invoke.cont466 to label %invoke.cont.i2130 unwind label %lpad.i2132 invoke.cont.i2130: ; preds = %invoke.cont469 - br i1 undef, label %invoke.cont475, label %if.then.i2131 + br i1 %arg, label %invoke.cont475, label %if.then.i2131 if.then.i2131: ; preds = %invoke.cont.i2130 br label %invoke.cont475 @@ -1563,7 +1563,7 @@ invoke.cont509: ; preds = %invoke.cont506 to label %invoke.cont512 unwind label %lpad489 invoke.cont512: ; preds = %invoke.cont509 - br i1 undef, label %msgSend.null-receiver, label %msgSend.call + br i1 %arg, label %msgSend.null-receiver, label %msgSend.call msgSend.call: ; preds = %invoke.cont512 invoke void @objc_msgSend_stret(ptr sret(%struct.CGPoint) undef, ptr undef, ptr undef) @@ -1577,7 +1577,7 @@ msgSend.cont: ; preds = %msgSend.null-receiv to label %invoke.cont.i2136 unwind label %lpad.i2138 invoke.cont.i2136: ; preds = %msgSend.cont - br i1 undef, label %invoke.cont521, label %if.then.i2137 + br i1 %arg, label %invoke.cont521, label %if.then.i2137 if.then.i2137: ; preds = %invoke.cont.i2136 br label %invoke.cont521 @@ -1604,7 +1604,7 @@ invoke.cont534: ; preds = %invoke.cont531 to label %invoke.cont.i2142 unwind label %lpad.i2144 invoke.cont.i2142: ; preds = %invoke.cont534 - br i1 undef, label %invoke.cont540, label %if.then.i2143 + br i1 %arg, label %invoke.cont540, label %if.then.i2143 if.then.i2143: ; preds = %invoke.cont.i2142 br label %invoke.cont540 @@ -1918,31 +1918,31 @@ eh.resume: ; preds = %lpad580, %ehcleanup @"OBJC_EHTYPE_$_NSException" = external global i8 -define void @test4() personality ptr @__objc_personality_v0 { +define void @test4(i1 %arg) personality ptr @__objc_personality_v0 { entry: - br i1 undef, label %if.end13, label %if.then10 + br i1 
%arg, label %if.end13, label %if.then10 if.then10: ; preds = %entry br label %if.end13 if.end13: ; preds = %if.then10, %entry %0 = call ptr @objc_msgSend(ptr undef, ptr undef, ptr @_unnamed_cfstring, i64 2, ptr @_unnamed_cfstring_2, i8 signext 0), !clang.arc.no_objc_arc_exceptions !0 - br i1 undef, label %if.then17, label %if.end18 + br i1 %arg, label %if.then17, label %if.end18 if.then17: ; preds = %if.end13 br label %if.end18 if.end18: ; preds = %if.then17, %if.end13 - br i1 undef, label %if.then64, label %if.end73 + br i1 %arg, label %if.then64, label %if.end73 if.then64: ; preds = %if.end18 - br i1 undef, label %cond.end71, label %cond.true68 + br i1 %arg, label %cond.end71, label %cond.true68 cond.true68: ; preds = %if.then64 br label %cond.end71 cond.end71: ; preds = %cond.true68, %if.then64 - br i1 undef, label %cleanup.action, label %cleanup.done + br i1 %arg, label %cleanup.action, label %cleanup.done cleanup.action: ; preds = %cond.end71 br label %cleanup.done @@ -1951,7 +1951,7 @@ cleanup.done: ; preds = %cleanup.action, %co br label %if.end73 if.end73: ; preds = %cleanup.done, %if.end18 - br i1 undef, label %forcoll.empty, label %forcoll.loopinit + br i1 %arg, label %forcoll.empty, label %forcoll.loopinit forcoll.loopinit: ; preds = %if.end73 br label %forcoll.loopbody.outer @@ -1960,34 +1960,34 @@ forcoll.loopbody.outer: ; preds = %forcoll.refetch, %f br label %forcoll.loopbody forcoll.loopbody: ; preds = %forcoll.notmutated, %forcoll.loopbody.outer - br i1 undef, label %forcoll.notmutated, label %forcoll.mutated + br i1 %arg, label %forcoll.notmutated, label %forcoll.mutated forcoll.mutated: ; preds = %forcoll.loopbody br label %forcoll.notmutated forcoll.notmutated: ; preds = %forcoll.mutated, %forcoll.loopbody - br i1 undef, label %forcoll.loopbody, label %forcoll.refetch + br i1 %arg, label %forcoll.loopbody, label %forcoll.refetch forcoll.refetch: ; preds = %forcoll.notmutated - br i1 undef, label %forcoll.empty, label %forcoll.loopbody.outer + br i1 %arg, label %forcoll.empty, label %forcoll.loopbody.outer forcoll.empty: ; preds = %forcoll.refetch, %if.end73 - br i1 undef, label %if.end85, label %if.then82 + br i1 %arg, label %if.end85, label %if.then82 if.then82: ; preds = %forcoll.empty br label %if.end85 if.end85: ; preds = %if.then82, %forcoll.empty - br i1 undef, label %if.then87, label %if.end102 + br i1 %arg, label %if.then87, label %if.end102 if.then87: ; preds = %if.end85 - br i1 undef, label %if.end94, label %if.then91 + br i1 %arg, label %if.end94, label %if.then91 if.then91: ; preds = %if.then87 br label %if.end94 if.end94: ; preds = %if.then91, %if.then87 - br i1 undef, label %if.end101, label %if.then98 + br i1 %arg, label %if.end101, label %if.then98 if.then98: ; preds = %if.end94 br label %if.end101 @@ -1996,139 +1996,139 @@ if.end101: ; preds = %if.then98, %if.end9 br label %if.end102 if.end102: ; preds = %if.end101, %if.end85 - br i1 undef, label %do.body113, label %if.then107 + br i1 %arg, label %do.body113, label %if.then107 if.then107: ; preds = %if.end102 br label %do.body113 do.body113: ; preds = %if.then107, %if.end102 - br i1 undef, label %if.then116, label %if.end117 + br i1 %arg, label %if.then116, label %if.end117 if.then116: ; preds = %do.body113 br label %if.end117 if.end117: ; preds = %if.then116, %do.body113 - br i1 undef, label %if.then125, label %if.end126 + br i1 %arg, label %if.then125, label %if.end126 if.then125: ; preds = %if.end117 br label %if.end126 if.end126: ; preds = %if.then125, %if.end117 - br i1 undef, label %do.end166, 
label %cond.true132 + br i1 %arg, label %do.end166, label %cond.true132 cond.true132: ; preds = %if.end126 - br i1 undef, label %do.body148, label %cond.true151 + br i1 %arg, label %do.body148, label %cond.true151 do.body148: ; preds = %cond.true132 - br i1 undef, label %do.end166, label %cond.true151 + br i1 %arg, label %do.end166, label %cond.true151 cond.true151: ; preds = %do.body148, %cond.true132 - br i1 undef, label %if.then162, label %do.end166 + br i1 %arg, label %if.then162, label %do.end166 if.then162: ; preds = %cond.true151 br label %do.end166 do.end166: ; preds = %if.then162, %cond.true151, %do.body148, %if.end126 - br i1 undef, label %if.then304, label %if.then170 + br i1 %arg, label %if.then304, label %if.then170 if.then170: ; preds = %do.end166 - br i1 undef, label %do.end193, label %cond.true179 + br i1 %arg, label %do.end193, label %cond.true179 cond.true179: ; preds = %if.then170 - br i1 undef, label %if.then190, label %do.end193 + br i1 %arg, label %if.then190, label %do.end193 if.then190: ; preds = %cond.true179 br label %do.end193 do.end193: ; preds = %if.then190, %cond.true179, %if.then170 - br i1 undef, label %do.body200, label %do.body283 + br i1 %arg, label %do.body200, label %do.body283 do.body200: ; preds = %do.end193 - br i1 undef, label %do.end254, label %cond.true203 + br i1 %arg, label %do.end254, label %cond.true203 cond.true203: ; preds = %do.body200 - br i1 undef, label %do.body218, label %cond.true221 + br i1 %arg, label %do.body218, label %cond.true221 do.body218: ; preds = %cond.true203 - br i1 undef, label %do.end254, label %cond.true221 + br i1 %arg, label %do.end254, label %cond.true221 cond.true221: ; preds = %do.body218, %cond.true203 - br i1 undef, label %if.then232, label %do.body236 + br i1 %arg, label %if.then232, label %do.body236 if.then232: ; preds = %cond.true221 br label %do.body236 do.body236: ; preds = %if.then232, %cond.true221 - br i1 undef, label %do.end254, label %cond.true239 + br i1 %arg, label %do.end254, label %cond.true239 cond.true239: ; preds = %do.body236 - br i1 undef, label %if.then250, label %do.end254 + br i1 %arg, label %if.then250, label %do.end254 if.then250: ; preds = %cond.true239 br label %do.end254 do.end254: ; preds = %if.then250, %cond.true239, %do.body236, %do.body218, %do.body200 - br i1 undef, label %do.end277, label %cond.true263 + br i1 %arg, label %do.end277, label %cond.true263 cond.true263: ; preds = %do.end254 - br i1 undef, label %if.then274, label %do.end277 + br i1 %arg, label %if.then274, label %do.end277 if.then274: ; preds = %cond.true263 unreachable do.end277: ; preds = %cond.true263, %do.end254 - br i1 undef, label %if.then280, label %do.body283 + br i1 %arg, label %if.then280, label %do.body283 if.then280: ; preds = %do.end277 br label %do.body283 do.body283: ; preds = %if.then280, %do.end277, %do.end193 - br i1 undef, label %if.end301, label %cond.true286 + br i1 %arg, label %if.end301, label %cond.true286 cond.true286: ; preds = %do.body283 - br i1 undef, label %if.then297, label %if.end301 + br i1 %arg, label %if.then297, label %if.end301 if.then297: ; preds = %cond.true286 br label %if.end301 if.end301: ; preds = %if.then297, %cond.true286, %do.body283 - br i1 undef, label %if.then304, label %do.body351 + br i1 %arg, label %if.then304, label %do.body351 if.then304: ; preds = %if.end301, %do.end166 - br i1 undef, label %do.body309.lr.ph, label %do.body351 + br i1 %arg, label %do.body309.lr.ph, label %do.body351 do.body309.lr.ph: ; preds = %if.then304 br label %do.body309 do.body309: ; preds 
= %for.cond.backedge, %do.body309.lr.ph - br i1 undef, label %do.end328, label %cond.true312 + br i1 %arg, label %do.end328, label %cond.true312 cond.true312: ; preds = %do.body309 - br i1 undef, label %if.then323, label %do.end328 + br i1 %arg, label %if.then323, label %do.end328 if.then323: ; preds = %cond.true312 br label %do.end328 do.end328: ; preds = %if.then323, %cond.true312, %do.body309 - br i1 undef, label %for.cond.backedge, label %cond.true335 + br i1 %arg, label %for.cond.backedge, label %cond.true335 for.cond.backedge: ; preds = %if.then346, %cond.true335, %do.end328 - br i1 undef, label %do.body309, label %do.body351 + br i1 %arg, label %do.body309, label %do.body351 cond.true335: ; preds = %do.end328 - br i1 undef, label %if.then346, label %for.cond.backedge + br i1 %arg, label %if.then346, label %for.cond.backedge if.then346: ; preds = %cond.true335 br label %for.cond.backedge do.body351: ; preds = %for.cond.backedge, %if.then304, %if.end301 - br i1 undef, label %if.then354, label %if.end355 + br i1 %arg, label %if.then354, label %if.end355 if.then354: ; preds = %do.body351 br label %if.end355 if.end355: ; preds = %if.then354, %do.body351 - br i1 undef, label %if.else, label %if.then364 + br i1 %arg, label %if.else, label %if.then364 if.then364: ; preds = %if.end355 br label %do.body366 @@ -2137,7 +2137,7 @@ if.else: ; preds = %if.end355 br label %do.body366 do.body366: ; preds = %if.else, %if.then364 - br i1 undef, label %if.then369, label %if.end377.critedge + br i1 %arg, label %if.then369, label %if.end377.critedge if.then369: ; preds = %do.body366 br label %if.end377 @@ -2146,7 +2146,7 @@ if.end377.critedge: ; preds = %do.body366 br label %if.end377 if.end377: ; preds = %if.end377.critedge, %if.then369 - br i1 undef, label %if.then383, label %if.end392.critedge + br i1 %arg, label %if.then383, label %if.end392.critedge if.then383: ; preds = %if.end377 br label %if.end392 @@ -2155,7 +2155,7 @@ if.end392.critedge: ; preds = %if.end377 br label %if.end392 if.end392: ; preds = %if.end392.critedge, %if.then383 - br i1 undef, label %if.then398, label %if.end399 + br i1 %arg, label %if.then398, label %if.end399 if.then398: ; preds = %if.end392 br label %if.end399 @@ -2165,7 +2165,7 @@ if.end399: ; preds = %if.then398, %if.end to label %eh.cont unwind label %lpad, !clang.arc.no_objc_arc_exceptions !0 eh.cont: ; preds = %if.end399 - br i1 undef, label %if.then430, label %if.end439.critedge + br i1 %arg, label %if.then430, label %if.end439.critedge if.then430: ; preds = %eh.cont %1 = call ptr @llvm.objc.retain(ptr %0) diff --git a/llvm/test/Transforms/PGOProfile/loop_entries_gen.ll b/llvm/test/Transforms/PGOProfile/loop_entries_gen.ll new file mode 100644 index 0000000000000..ed101271558c6 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/loop_entries_gen.ll @@ -0,0 +1,58 @@ +; RUN: opt %s -passes=pgo-instr-gen -S | FileCheck %s --check-prefixes=CHECK,NOTLOOPENTRIES --implicit-check-not=@llvm.instrprof.increment +; RUN: opt %s -passes=pgo-instr-gen -pgo-instrument-loop-entries -S | FileCheck %s --check-prefixes=CHECK,LOOPENTRIES --implicit-check-not=@llvm.instrprof.increment +; RUN: opt %s -passes=pgo-instr-gen -pgo-instrument-entry -S | FileCheck %s --check-prefixes=CHECK,FUNCTIONENTRY --implicit-check-not=@llvm.instrprof.increment + +; CHECK: $__llvm_profile_raw_version = comdat any +; CHECK: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat +; CHECK: @__profn_test_simple_for_with_bypass = private constant [27 x i8] c"test_simple_for_with_bypass" + +define 
i32 @test_simple_for_with_bypass(i32 %n) { +entry: +; CHECK: entry: +; NOTLOOPENTRIES: call void @llvm.instrprof.increment(ptr @__profn_test_simple_for_with_bypass, i64 {{[0-9]+}}, i32 3, i32 1) +; LOOPENTRIES: call void @llvm.instrprof.increment(ptr @__profn_test_simple_for_with_bypass, i64 {{[0-9]+}}, i32 3, i32 1) +; FUNCTIONENTRY: call void @llvm.instrprof.increment(ptr @__profn_test_simple_for_with_bypass, i64 {{[0-9]+}}, i32 3, i32 0) + %mask = and i32 %n, 65535 + %skip = icmp eq i32 %mask, 0 + br i1 %skip, label %end, label %for.entry + +for.entry: +; CHECK: for.entry: +; LOOPENTRIES: call void @llvm.instrprof.increment(ptr @__profn_test_simple_for_with_bypass, i64 {{[0-9]+}}, i32 3, i32 2) + br label %for.cond + +for.cond: +; CHECK: for.cond: + %i = phi i32 [ 0, %for.entry ], [ %inc1, %for.inc ] + %sum = phi i32 [ 1, %for.entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %for.body, label %for.end, !prof !1 + +for.body: +; CHECK: for.body: + %inc = add nsw i32 %sum, 1 + br label %for.inc + +for.inc: +; CHECK: for.inc: +; NOTLOOPENTRIES: call void @llvm.instrprof.increment(ptr @__profn_test_simple_for_with_bypass, i64 {{[0-9]+}}, i32 3, i32 0) +; LOOPENTRIES: call void @llvm.instrprof.increment(ptr @__profn_test_simple_for_with_bypass, i64 {{[0-9]+}}, i32 3, i32 0) +; FUNCTIONENTRY: call void @llvm.instrprof.increment(ptr @__profn_test_simple_for_with_bypass, i64 {{[0-9]+}}, i32 3, i32 1) + %inc1 = add nsw i32 %i, 1 + br label %for.cond + +for.end: +; CHECK: for.end: +; NOTLOOPENTRIES: call void @llvm.instrprof.increment(ptr @__profn_test_simple_for_with_bypass, i64 {{[0-9]+}}, i32 3, i32 2) +; FUNCTIONENTRY: call void @llvm.instrprof.increment(ptr @__profn_test_simple_for_with_bypass, i64 {{[0-9]+}}, i32 3, i32 2) + br label %end + +end: +; CHECK: end: + %final_sum = phi i32 [ %sum, %for.end ], [ 0, %entry ] + ret i32 %final_sum +} + +; CHECK: declare void @llvm.instrprof.increment(ptr, i64, i32, i32) #0 + +!1 = !{!"branch_weights", i32 100000, i32 80} diff --git a/llvm/test/Transforms/PGOProfile/loop_entries_use.ll b/llvm/test/Transforms/PGOProfile/loop_entries_use.ll new file mode 100644 index 0000000000000..616ecbaf439c3 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/loop_entries_use.ll @@ -0,0 +1,106 @@ +; RUN: rm -rf %t && split-file %s %t + +; RUN: llvm-profdata merge %t/default.proftext -o %t/default.profdata +; RUN: opt %t/main.ll -passes=pgo-instr-use -pgo-test-profile-file=%t/default.profdata -S | FileCheck %s +; RUN: llvm-profdata merge %t/loop_entries.proftext -o %t/loop_entries.profdata +; RUN: opt %t/main.ll -passes=pgo-instr-use -pgo-test-profile-file=%t/loop_entries.profdata -S | FileCheck %s +; RUN: llvm-profdata merge %t/function_entry.proftext -o %t/function_entry.profdata +; RUN: opt %t/main.ll -passes=pgo-instr-use -pgo-test-profile-file=%t/function_entry.profdata -S | FileCheck %s + +;--- main.ll + +define i32 @test_simple_for_with_bypass(i32 %n) { +; CHECK: define i32 @test_simple_for_with_bypass(i32 %n) +; CHECK-SAME: !prof ![[ENTRY_COUNT:[0-9]*]] +entry: +; CHECK: entry: + %mask = and i32 %n, 65535 + %skip = icmp eq i32 %mask, 0 + br i1 %skip, label %end, label %for.entry +; CHECK: br i1 %skip, label %end, label %for.entry +; CHECK-SAME: !prof ![[BW_FOR_BYPASS:[0-9]+]] + +for.entry: +; CHECK: for.entry: + br label %for.cond + +for.cond: +; CHECK: for.cond: + %i = phi i32 [ 0, %for.entry ], [ %inc1, %for.inc ] + %sum = phi i32 [ 1, %for.entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %for.body, 
label %for.end, !prof !1 +; CHECK: br i1 %cmp, label %for.body, label %for.end +; CHECK-SAME: !prof ![[BW_FOR_COND:[0-9]+]] + +for.body: +; CHECK: for.body: + %inc = add nsw i32 %sum, 1 + br label %for.inc + +for.inc: +; CHECK: for.inc: + %inc1 = add nsw i32 %i, 1 + br label %for.cond + +for.end: +; CHECK: for.end: + br label %end + +end: +; CHECK: end: + %final_sum = phi i32 [ %sum, %for.end ], [ 0, %entry ] + ret i32 %final_sum +} + +!1 = !{!"branch_weights", i32 100000, i32 80} + +; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 12} +; CHECK: ![[BW_FOR_BYPASS]] = !{!"branch_weights", i32 4, i32 8} +; CHECK: ![[BW_FOR_COND]] = !{!"branch_weights", i32 123456, i32 8} + +;--- default.proftext + +# :ir is the flag to indicate this is IR level profile. +:ir +test_simple_for_with_bypass +# Func Hash: +536873292337293370 +# Num Counters: +3 +# Counter Values: +123456 +12 +8 + +;--- loop_entries.proftext + +# :ir is the flag to indicate this is IR level profile. +:ir +# Always instrument the loop entry blocks +:instrument_loop_entries +test_simple_for_with_bypass +# Func Hash: +536873292337293370 +# Num Counters: +3 +# Counter Values: +123456 +12 +8 + +;--- function_entry.proftext + +# :ir is the flag to indicate this is IR level profile. +:ir +# Always instrument the function entry block +:entry_first +test_simple_for_with_bypass +# Func Hash: +536873292337293370 +# Num Counters: +3 +# Counter Values: +12 +123456 +8 diff --git a/llvm/test/Transforms/Reassociate/2011-01-26-UseAfterFree.ll b/llvm/test/Transforms/Reassociate/2011-01-26-UseAfterFree.ll index 4cde69bb0c7dd..80a9e77030e64 100644 --- a/llvm/test/Transforms/Reassociate/2011-01-26-UseAfterFree.ll +++ b/llvm/test/Transforms/Reassociate/2011-01-26-UseAfterFree.ll @@ -3,7 +3,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" target triple = "i386-gnu-linux" -define void @exp_averages_intraday__deviation() { +define void @exp_averages_intraday__deviation(i1 %arg) { entry: %0 = load i32, ptr undef, align 4 %1 = shl i32 %0, 2 @@ -16,14 +16,14 @@ entry: br i1 false, label %"4", label %"12" "4": ; preds = %entry - br i1 undef, label %"5", label %"8" + br i1 %arg, label %"5", label %"8" "5": ; preds = %"4" unreachable "8": ; preds = %"4" %8 = getelementptr inbounds i8, ptr undef, i32 %6 - br i1 undef, label %"13", label %"12" + br i1 %arg, label %"13", label %"12" "12": ; preds = %"8", %entry ret void diff --git a/llvm/test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll b/llvm/test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll index 650aa82c1af2f..ba730758a1f26 100644 --- a/llvm/test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll +++ b/llvm/test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll @@ -1,14 +1,14 @@ ; RUN: opt < %s -passes=reassociate -disable-output ; PR13041 -define void @foo() { +define void @foo(i1 %arg) { entry: br label %while.cond while.cond: ; preds = %while.body, %entry %b.0 = phi i32 [ undef, %entry ], [ %sub2, %while.body ] %c.0 = phi i32 [ undef, %entry ], [ %sub3, %while.body ] - br i1 undef, label %while.end, label %while.body + br i1 %arg, label %while.end, label %while.body while.body: ; preds = %while.cond %sub = sub nsw i32 0, %b.0 diff --git a/llvm/test/Transforms/Reassociate/add_across_block_crash.ll b/llvm/test/Transforms/Reassociate/add_across_block_crash.ll index 26e971266d344..a89f81154f3eb 100644 --- a/llvm/test/Transforms/Reassociate/add_across_block_crash.ll +++ 
b/llvm/test/Transforms/Reassociate/add_across_block_crash.ll @@ -3,10 +3,10 @@ ; This test makes sure that, while processing a block, uses of instructions ; from a different basic block don't get added for re-optimization. -define void @main() { +define void @main(i1 %arg) { ; CHECK-LABEL: @main( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label %bb1, label %bb2 +; CHECK-NEXT: br i1 %arg, label %bb1, label %bb2 ; CHECK: bb1: ; CHECK-NEXT: ret void ; CHECK: bb2: @@ -14,7 +14,7 @@ define void @main() { ; entry: %0 = fadd fast float undef, undef - br i1 undef, label %bb1, label %bb2 + br i1 %arg, label %bb1, label %bb2 bb1: %1 = fmul fast float undef, -2.000000e+00 diff --git a/llvm/test/Transforms/Reassociate/infloop-deadphi.ll index 5b19f8d384f4d..3202f450b7566 100644 --- a/llvm/test/Transforms/Reassociate/infloop-deadphi.ll +++ b/llvm/test/Transforms/Reassociate/infloop-deadphi.ll @@ -3,14 +3,14 @@ target triple = "x86_64-unknown-linux-gnu" -define void @f() { +define void @f(i1 %arg) { ; CHECK-LABEL: @f( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[DONE:%.*]] ; CHECK: dead: ; CHECK-NEXT: [[XOR0:%.*]] = xor i16 [[XOR1:%.*]], undef ; CHECK-NEXT: [[XOR1]] = xor i16 [[XOR0]], undef -; CHECK-NEXT: br i1 undef, label [[DEAD:%.*]], label [[DONE]] +; CHECK-NEXT: br i1 %arg, label [[DEAD:%.*]], label [[DONE]] ; CHECK: done: ; CHECK-NEXT: ret void ; @@ -20,7 +20,7 @@ entry: dead: %xor0 = xor i16 %xor1, undef %xor1 = xor i16 %xor0, undef - br i1 undef, label %dead, label %done + br i1 %arg, label %dead, label %done done: %e = phi i16 [ %xor1, %dead ], [ 0, %entry ] diff --git a/llvm/test/Transforms/Reassociate/reassociate-landingpad.ll index eb6a5cabb7be8..c5bd62e2763b6 100644 --- a/llvm/test/Transforms/Reassociate/reassociate-landingpad.ll +++ b/llvm/test/Transforms/Reassociate/reassociate-landingpad.ll @@ -14,7 +14,7 @@ declare i32 @__gxx_personality_v0(...) declare void @b() #0 -define void @a() #0 personality ptr @__gxx_personality_v0 { +define void @a(i1 %arg) #0 personality ptr @__gxx_personality_v0 { ", bb1": invoke void @b() to label %invoke.cont unwind label %"bb22" @@ -39,7 +39,7 @@ define void @a() #0 personality ptr @__gxx_personality_v0 { unreachable invoke.cont: ; preds = %", bb1" - br i1 undef, label %", bb15", label %", bb8" + br i1 %arg, label %", bb15", label %", bb8" invoke.cont25: ; preds = %", bb8" unreachable diff --git a/llvm/test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll index 2098b0390b64a..974c2dac78ebf 100644 --- a/llvm/test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll +++ b/llvm/test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=sccp -S | FileCheck %s -; Branch on undef is UB, so the T block is never executed, and we can return +; Branch on poison is UB, so the T block is never executed, and we can return ; undef (IPSCCP would replace the block with unreachable).
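; A minimal illustrative sketch, not part of this test suite (@fold_me and
; @keep_me are hypothetical names), of the distinction the tests in this
; patch rely on: branching on poison is immediate UB, so a constant
; propagation pass may treat the branch as unreachable and fold whatever it
; guards, and a branch on undef may likewise be resolved to either successor.
; Branching on an opaque function argument, by contrast, keeps both
; successors live:

define i32 @fold_me() {
  ; branch on poison is immediate UB; SCCP/IPSCCP may fold this block away
  br i1 poison, label %t, label %f
t:
  ret i32 1
f:
  ret i32 0
}

define i32 @keep_me(i1 %arg) {
  ; %arg is unknown to the optimizer, so both successors stay reachable
  br i1 %arg, label %t, label %f
t:
  ret i32 1
f:
  ret i32 0
}

; Hence the pattern repeated throughout this patch: thread an i1 %arg
; parameter into functions that used to branch on undef, so the CFG under
; test survives optimization. @foo below deliberately keeps a branch on
; poison, because the fold itself is what the test checks.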
define i32 @foo() { ; CHECK-LABEL: @foo( ; CHECK-NEXT: unreachable ; - br i1 undef, label %T, label %T + br i1 poison, label %T, label %T T: %X = add i32 0, 1 ret i32 %X diff --git a/llvm/test/Transforms/SCCP/2006-10-23-IPSCCP-Crash.ll b/llvm/test/Transforms/SCCP/2006-10-23-IPSCCP-Crash.ll index 3e2cfe1ce25fb..8c1687a3eb784 100644 --- a/llvm/test/Transforms/SCCP/2006-10-23-IPSCCP-Crash.ll +++ b/llvm/test/Transforms/SCCP/2006-10-23-IPSCCP-Crash.ll @@ -6,7 +6,7 @@ target triple = "powerpc-unknown-linux-gnu" @JUMP = external global i32 ; [#uses=1] @old_D_pat = external global [16 x i8] ; [#uses=0] -define void @asearch1(i32 %D) { +define void @asearch1(i32 %D, i1 %arg) { entry: %tmp80 = icmp ult i32 0, %D ; [#uses=1] br i1 %tmp80, label %bb647.preheader, label %cond_true81.preheader @@ -22,7 +22,7 @@ cond_true612: ; preds = %cond_true654 cond_next624: ; preds = %cond_true654 ret void cond_true654: ; preds = %bb647.preheader - br i1 undef, label %cond_true612, label %cond_next624 + br i1 %arg, label %cond_true612, label %cond_next624 UnifiedReturnBlock: ; preds = %bb647.preheader ret void } diff --git a/llvm/test/Transforms/SCCP/2008-01-27-UndefCorrelate.ll b/llvm/test/Transforms/SCCP/2008-01-27-UndefCorrelate.ll index 6f499f27a65a9..895b1393dadae 100644 --- a/llvm/test/Transforms/SCCP/2008-01-27-UndefCorrelate.ll +++ b/llvm/test/Transforms/SCCP/2008-01-27-UndefCorrelate.ll @@ -2,12 +2,30 @@ ; RUN: opt < %s -passes=sccp -S | FileCheck %s ; PR1938 -define i32 @main() { +define i32 @main(i1 %arg) { ; CHECK-LABEL: @main( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[K:%.*]], [[BB_BACKEDGE:%.*]] ] +; CHECK-NEXT: [[K]] = add i32 [[INDVAR]], 1 +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +; CHECK: cond_true: +; CHECK-NEXT: [[TMP97:%.*]] = icmp slt i32 [[K]], 10 +; CHECK-NEXT: br i1 [[TMP97]], label [[BB_BACKEDGE]], label [[BB12:%.*]] +; CHECK: bb.backedge: +; CHECK-NEXT: br label [[BB]] +; CHECK: cond_false: +; CHECK-NEXT: [[TMP9:%.*]] = icmp slt i32 [[K]], 10 +; CHECK-NEXT: br i1 [[TMP9]], label [[BB_BACKEDGE]], label [[BB12]] +; CHECK: bb12: +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[K]], 10 +; CHECK-NEXT: br i1 [[TMP14]], label [[COND_NEXT18:%.*]], label [[COND_TRUE17:%.*]] +; CHECK: cond_true17: +; CHECK-NEXT: tail call void @abort() ; CHECK-NEXT: unreachable +; CHECK: cond_next18: +; CHECK-NEXT: ret i32 0 ; entry: br label %bb @@ -15,7 +33,7 @@ entry: bb: %indvar = phi i32 [ 0, %entry ], [ %k, %bb.backedge ] %k = add i32 %indvar, 1 - br i1 undef, label %cond_true, label %cond_false + br i1 %arg, label %cond_true, label %cond_false cond_true: %tmp97 = icmp slt i32 %k, 10 diff --git a/llvm/test/Transforms/SCCP/PR26044.ll b/llvm/test/Transforms/SCCP/PR26044.ll index 90ac3101d0c23..f786629f47d0e 100644 --- a/llvm/test/Transforms/SCCP/PR26044.ll +++ b/llvm/test/Transforms/SCCP/PR26044.ll @@ -3,13 +3,13 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define void @fn2(ptr %P) { +define void @fn2(ptr %P, i1 %arg) { ; CHECK-LABEL: define {{[^@]+}}@fn2 -; CHECK-SAME: (ptr [[P:%.*]]) { +; CHECK-SAME: (ptr [[P:%.*]], i1 [[ARG:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[IF_END:%.*]] ; CHECK: for.cond1: -; CHECK-NEXT: unreachable +; CHECK-NEXT: br i1 [[ARG]], label [[IF_END]], label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: [[CALL:%.*]] = call i32 @fn1(i32 undef) ; CHECK-NEXT: store i32 [[CALL]], ptr [[P]], align 
4 @@ -19,7 +19,7 @@ entry: br label %if.end for.cond1: ; preds = %if.end, %for.end - br i1 undef, label %if.end, label %if.end + br i1 %arg, label %if.end, label %if.end if.end: ; preds = %lbl, %for.cond1 %e.2 = phi ptr [ undef, %entry ], [ null, %for.cond1 ], [ null, %for.cond1 ] @@ -43,15 +43,16 @@ entry: ret i32 %cond } -define void @fn_no_null_opt(ptr %P) #0 { +define void @fn_no_null_opt(ptr %P, i1 %arg) #0 { ; CHECK-LABEL: define {{[^@]+}}@fn_no_null_opt -; CHECK-SAME: (ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: (ptr [[P:%.*]], i1 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[IF_END:%.*]] ; CHECK: for.cond1: -; CHECK-NEXT: unreachable +; CHECK-NEXT: br i1 [[ARG]], label [[IF_END]], label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[CALL:%.*]] = call i32 @fn0(i32 undef) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr null, align 4 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @fn0(i32 [[TMP0]]) ; CHECK-NEXT: store i32 [[CALL]], ptr [[P]], align 4 ; CHECK-NEXT: br label [[FOR_COND1:%.*]] ; @@ -59,7 +60,7 @@ entry: br label %if.end for.cond1: ; preds = %if.end, %for.end - br i1 undef, label %if.end, label %if.end + br i1 %arg, label %if.end, label %if.end if.end: ; preds = %lbl, %for.cond1 %e.2 = phi ptr [ undef, %entry ], [ null, %for.cond1 ], [ null, %for.cond1 ] @@ -73,8 +74,8 @@ define internal i32 @fn0(i32 %p1) { ; CHECK-LABEL: define {{[^@]+}}@fn0 ; CHECK-SAME: (i32 [[P1:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 undef, 0 -; CHECK-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i32 undef, i32 undef +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[P1]], 0 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i32 [[P1]], i32 [[P1]] ; CHECK-NEXT: ret i32 [[COND]] ; entry: diff --git a/llvm/test/Transforms/SCCP/crash.ll b/llvm/test/Transforms/SCCP/crash.ll index 47d9329f6f03d..9001b42850c25 100644 --- a/llvm/test/Transforms/SCCP/crash.ll +++ b/llvm/test/Transforms/SCCP/crash.ll @@ -2,9 +2,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-apple-darwin10.0" -define void @test1(i8 %arg) { +define void @test1(i8 %arg, i1 %arg1) { entry: - br i1 undef, label %return, label %bb + br i1 %arg1, label %return, label %bb bb: br label %bb34 diff --git a/llvm/test/Transforms/SCCP/domtree-update.ll b/llvm/test/Transforms/SCCP/domtree-update.ll index 76f575c4e9c6d..270da0e2f2bfe 100644 --- a/llvm/test/Transforms/SCCP/domtree-update.ll +++ b/llvm/test/Transforms/SCCP/domtree-update.ll @@ -4,7 +4,7 @@ ; DTU should not crash. 
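; Editorial sketch of the rewrite these hunks all apply (illustrative only,
; not part of this patch; block and parameter names below are generic
; placeholders). SCCP resolves a branch on undef to an arbitrary successor,
; and branching on poison is immediate undefined behavior, so `br i1 undef`
; lets the pass delete the very CFG shape a test meant to exercise.
; Threading a fresh `i1 %arg` parameter through instead keeps both edges
; feasible:
;
;   br i1 undef, label %then, label %else   ; SCCP may fold this branch away
;   br i1 %arg, label %then, label %else    ; both successors stay live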
-define i32 @test() { +define i32 @test(i1 %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -25,10 +25,10 @@ if.then2: ; preds = %for.body br label %for.inc if.else: ; preds = %for.body - br i1 undef, label %lor.rhs, label %if.then19.critedge + br i1 %arg, label %lor.rhs, label %if.then19.critedge lor.rhs: ; preds = %if.else - br i1 undef, label %if.then19, label %for.inc + br i1 %arg, label %if.then19, label %for.inc if.then19.critedge: ; preds = %if.else br label %if.then19 diff --git a/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll b/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll index 8061a0396ee4c..6a8b52d0ac481 100644 --- a/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll +++ b/llvm/test/Transforms/SCCP/fp-bc-icmp-const-fold.ll @@ -2,18 +2,18 @@ target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64le-unknown-linux" -define void @test(i32 signext %n) { +define void @test(i32 signext %n, i1 %arg) { ; CHECK-LABEL: @test entry: - br i1 undef, label %if.then, label %if.end + br i1 %arg, label %if.then, label %if.end if.then: ; preds = %entry ret void if.end: ; preds = %entry - br i1 undef, label %if.then2, label %if.end4 + br i1 %arg, label %if.then2, label %if.end4 if.then2: ; preds = %if.end unreachable @@ -36,10 +36,10 @@ if.else14: ; preds = %if.end4 do.body: ; preds = %do.body, %if.else14 %scale.0 = phi ppc_fp128 [ 0xM3FF00000000000000000000000000000, %if.else14 ], [ %scale.0, %do.body ] - br i1 undef, label %do.body, label %if.then33 + br i1 %arg, label %do.body, label %if.then33 if.then33: ; preds = %do.body - br i1 undef, label %_ZN5boost4math4signIgEEiRKT_.exit30, label %cond.false.i28 + br i1 %arg, label %_ZN5boost4math4signIgEEiRKT_.exit30, label %cond.false.i28 cond.false.i28: ; preds = %if.then33 %0 = bitcast ppc_fp128 %scale.0 to i128 diff --git a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll index f8c8e33dfc233..be05d96f08574 100644 --- a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll +++ b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll @@ -20,16 +20,17 @@ ; CHECK-NEXT: [2] %for.body {4294967295,4294967295} [1] ; CHECK-NEXT: [2] %if.end4 {4294967295,4294967295} [1] ; CHECK-NEXT: [3] %entry {4294967295,4294967295} [2] -; CHECK-NEXT: [2] %for.cond34 {4294967295,4294967295} [1] -; CHECK-NEXT: [3] %for.cond16 {4294967295,4294967295} [2] -; CHECK-NEXT: Roots: %for.body %for.cond34 +; CHECK-NEXT: [2] %for.body37 {4294967295,4294967295} [1] +; CHECK-NEXT: [3] %for.cond34 {4294967295,4294967295} [2] +; CHECK-NEXT: [4] %for.cond16 {4294967295,4294967295} [3] +; CHECK-NEXT: Roots: %for.body %for.body37 ; CHECK-NEXT: PostDominatorTree for function: bar ; CHECK-NOT: declare hidden i1 @compare(ptr) align 2 declare hidden { i8, ptr } @getType(ptr) align 2 -define internal void @foo(ptr %TLI, ptr %DL, ptr %Ty, ptr %ValueVTs, ptr %Offsets, i64 %StartingOffset) { +define internal void @foo(ptr %TLI, ptr %DL, ptr %Ty, ptr %ValueVTs, ptr %Offsets, i64 %StartingOffset, i1 %arg) { entry: %VT = alloca i64, align 8 br i1 false, label %if.then, label %if.end4 @@ -51,7 +52,7 @@ for.cond16: ; preds = %for.cond34, %if.end br label %for.cond34 for.cond34: ; preds = %for.body37, %for.cond16 - br i1 undef, label %for.body37, label %for.cond16 + br i1 %arg, label %for.body37, label %for.cond16 for.body37: ; preds = %for.cond34 %tobool39 = icmp ne ptr %Offsets, null diff --git a/llvm/test/Transforms/SCCP/pr49582-iterator-invalidation.ll 
b/llvm/test/Transforms/SCCP/pr49582-iterator-invalidation.ll index ed5703da35e6f..1ba6e9734e642 100644 --- a/llvm/test/Transforms/SCCP/pr49582-iterator-invalidation.ll +++ b/llvm/test/Transforms/SCCP/pr49582-iterator-invalidation.ll @@ -5,7 +5,7 @@ @c = external dso_local global ptr, align 8 @d = external dso_local global i32, align 4 -define void @f(i32 %i) { +define void @f(i32 %i, i1 %arg) { entry: br label %for.cond @@ -474,7 +474,7 @@ if.then312: ; preds = %if.then309 br label %if.end628 if.else316: ; preds = %if.then309 - br i1 undef, label %if.then318, label %if.end628 + br i1 %arg, label %if.then318, label %if.end628 if.then318: ; preds = %if.else316 %idxprom320 = sext i32 %add310 to i64 @@ -726,7 +726,7 @@ if.then499: ; preds = %if.else496 br label %if.end628 if.else501: ; preds = %if.else496 - br i1 undef, label %if.then503, label %if.end628 + br i1 %arg, label %if.then503, label %if.end628 if.then503: ; preds = %if.else501 br label %if.end628 @@ -834,7 +834,7 @@ if.then596: ; preds = %if.then593 br label %if.end628 if.else600: ; preds = %if.then593 - br i1 undef, label %if.then602, label %if.end628 + br i1 %arg, label %if.then602, label %if.end628 if.then602: ; preds = %if.else600 %idxprom604 = sext i32 %add594 to i64 diff --git a/llvm/test/Transforms/SCCP/return-zapped.ll b/llvm/test/Transforms/SCCP/return-zapped.ll index 6d70500125093..cafc8aa69e868 100644 --- a/llvm/test/Transforms/SCCP/return-zapped.ll +++ b/llvm/test/Transforms/SCCP/return-zapped.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature ; RUN: opt < %s -S -passes=ipsccp | FileCheck %s -; testf() performs an unconditional branch on undef, as such the testf() return +; testf() performs a conditional branch on poison; as such, the testf() return ; value used in test1() will remain "unknown" and the following branch on it ; replaced by unreachable. This is fine, as the call to testf() will already ; trigger undefined behavior.
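; Sketch of the zapped-return effect described above (illustrative only,
; with made-up names; not part of this test). Since @testf's return value
; never resolves to a concrete i1, IPSCCP treats the caller's branch on it
; as dead:
;
;   %r = call i1 @testf()
;   br i1 %r, label %a, label %b   ; IPSCCP replaces this with: unreachable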
@@ -31,7 +31,7 @@ define internal i1 @testf() { ; CHECK-NEXT: unreachable ; entry: - br i1 undef, label %if.then1, label %if.end3 + br i1 poison, label %if.then1, label %if.end3 if.then1: ; preds = %if.end br label %if.end3 diff --git a/llvm/test/Transforms/SCCP/solve-after-each-resolving-undefs-for-function.ll b/llvm/test/Transforms/SCCP/solve-after-each-resolving-undefs-for-function.ll index 05f3358045c3c..a4b1ba8a0c8dd 100644 --- a/llvm/test/Transforms/SCCP/solve-after-each-resolving-undefs-for-function.ll +++ b/llvm/test/Transforms/SCCP/solve-after-each-resolving-undefs-for-function.ll @@ -16,7 +16,7 @@ entry: br i1 %c, label %if.cond, label %if.end if.cond: - br i1 undef, label %if.then, label %if.end + br i1 poison, label %if.then, label %if.end if.then: ; preds = %entry, %if.then ret i32 11 diff --git a/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll b/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll index 2336c9186636e..73eef205adae7 100644 --- a/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll +++ b/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll @@ -59,13 +59,23 @@ bb38: ; preds = %bb16 } -define void @hoge() { -; CHECK-LABEL: define {{[^@]+}}@hoge() { +define void @hoge(i1 %arg, i16 %arg2) { +; CHECK-LABEL: define {{[^@]+}}@hoge +; CHECK-SAME: (i1 [[ARG:%.*]], i16 [[ARG2:%.*]]) { ; CHECK-NEXT: bb: +; CHECK-NEXT: switch i16 [[ARG2]], label [[BB1:%.*]] [ +; CHECK-NEXT: i16 135, label [[BB2:%.*]] +; CHECK-NEXT: i16 66, label [[BB2]] +; CHECK-NEXT: ] +; CHECK: bb1: +; CHECK-NEXT: ret void +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb3: ; CHECK-NEXT: unreachable ; bb: - switch i16 undef, label %bb1 [ + switch i16 %arg2, label %bb1 [ i16 135, label %bb2 i16 66, label %bb2 ] @@ -89,14 +99,9 @@ bb4: ; preds = %bb2, %bb2, %bb2 ; Test case from PR49573. %default.bb is unfeasible. Make sure it gets replaced ; by an unreachable block. -define void @pr49573_main() { -; CHECK-LABEL: define {{[^@]+}}@pr49573_main() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TGT:%.*]] = call i16 @pr49573_fn() -; CHECK-NEXT: unreachable -; +define void @pr49573_main(i1 %arg) { entry: - %tgt = call i16 @pr49573_fn() + %tgt = call i16 @pr49573_fn(i1 %arg) switch i16 %tgt, label %default.bb [ i16 0, label %case.0 i16 1, label %case.1 @@ -116,7 +121,7 @@ case.2: br label %next next: - %tgt.2 = call i16 @pr49573_fn_2() + %tgt.2 = call i16 @pr49573_fn_2(i1 %arg) switch i16 %tgt.2, label %default.bb [ i16 0, label %case.0 i16 2, label %case.2 @@ -124,14 +129,9 @@ next: } ; Make sure a new unreachable BB is created. 
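; Sketch of the expected shape (illustrative only; names are placeholders).
; When the default destination is infeasible, SCCP redirects the default
; edge to a freshly created block that contains only `unreachable`:
;
;   switch i16 %t, label %default.unreachable [ i16 0, label %case.0 ]
; default.unreachable:
;   unreachable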
-define void @pr49573_main_2() { -; CHECK-LABEL: define {{[^@]+}}@pr49573_main_2() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TGT:%.*]] = call i16 @pr49573_fn() -; CHECK-NEXT: unreachable -; +define void @pr49573_main_2(i1 %arg) { entry: - %tgt = call i16 @pr49573_fn() + %tgt = call i16 @pr49573_fn(i1 %arg) switch i16 %tgt, label %default.bb [ i16 0, label %case.0 i16 1, label %case.1 @@ -151,13 +151,18 @@ case.2: ret void } -define internal i16 @pr49573_fn() { -; CHECK-LABEL: define {{[^@]+}}@pr49573_fn() { +define internal i16 @pr49573_fn(i1 %arg) { +; CHECK-LABEL: define {{[^@]+}}@pr49573_fn +; CHECK-SAME: (i1 [[ARG:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: unreachable +; CHECK-NEXT: br i1 [[ARG]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: ret i16 0 +; CHECK: else: +; CHECK-NEXT: ret i16 2 ; entry: - br i1 undef, label %then, label %else + br i1 %arg, label %then, label %else then: ret i16 0 @@ -166,13 +171,18 @@ else: ret i16 2 } -define internal i16 @pr49573_fn_2() { -; CHECK-LABEL: define {{[^@]+}}@pr49573_fn_2() { +define internal i16 @pr49573_fn_2(i1 %arg) { +; CHECK-LABEL: define {{[^@]+}}@pr49573_fn_2 +; CHECK-SAME: (i1 [[ARG:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: unreachable +; CHECK-NEXT: br i1 [[ARG]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: ret i16 0 +; CHECK: else: +; CHECK-NEXT: ret i16 2 ; entry: - br i1 undef, label %then, label %else + br i1 %arg, label %then, label %else then: ret i16 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll index 920e1e64e3958..356102ce81780 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll @@ -8,10 +8,10 @@ target triple = "aarch64--linux-gnu" ; should not compute a smaller size for %k.13 since it is in a use-def cycle ; and cannot be demoted. ; -define fastcc void @PR26364() { +define fastcc void @PR26364(i1 %arg) { ; CHECK-LABEL: @PR26364( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[FOR_END11:%.*]], label [[FOR_COND4:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END11:%.*]], label [[FOR_COND4:%.*]] ; CHECK: for.cond4: ; CHECK-NEXT: [[K_13:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[K_3:%.*]], [[FOR_COND4]] ] ; CHECK-NEXT: [[E_02:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ 0, [[FOR_COND4]] ] @@ -22,7 +22,7 @@ define fastcc void @PR26364() { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %for.end11, label %for.cond4 + br i1 %arg, label %for.end11, label %for.cond4 for.cond4: %k.13 = phi i32 [ undef, %entry ], [ %k.3, %for.cond4 ] @@ -39,10 +39,10 @@ for.end11: ; every root in the vectorizable tree when computing minimum sizes since one ; root may require fewer bits than another. 
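; Editorial note on the two tests above and below (not part of the test
; file): "minimum sizes" refers to SLP's minimum bit-width analysis, which
; tries to demote a vectorized expression tree to a narrower integer type.
; A value in a use-def cycle, such as the phi %k.13 that feeds itself,
; cannot be demoted, and when a tree has several roots the analysis must
; honor the widest requirement among them.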
; -define void @PR26629(ptr %c) { +define void @PR26629(ptr %c, i1 %arg) { ; CHECK-LABEL: @PR26629( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[FOR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_PH:%.*]], label [[FOR_END:%.*]] ; CHECK: for.ph: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[C:%.*]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -59,7 +59,7 @@ define void @PR26629(ptr %c) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %for.ph, label %for.end + br i1 %arg, label %for.ph, label %for.end for.ph: %0 = load i32, ptr %c, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll index dc05967af1529..f5e904467baa7 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-w32-windows-gnu | FileCheck %s -define i32 @foo(i32 %v1, double %v2) { +define i32 @foo(i32 %v1, double %v2, i1 %arg, i32 %arg2) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[V1:%.*]], i32 0 @@ -15,7 +15,7 @@ define i32 @foo(i32 %v1, double %v2) { ; CHECK: if.end: ; CHECK-NEXT: br label [[FOR_COND15:%.*]] ; CHECK: for.end39: -; CHECK-NEXT: switch i32 undef, label [[DO_BODY:%.*]] [ +; CHECK-NEXT: switch i32 %arg2, label [[DO_BODY:%.*]] [ ; CHECK-NEXT: i32 0, label [[SW_BB:%.*]] ; CHECK-NEXT: i32 1, label [[SW_BB195:%.*]] ; CHECK-NEXT: ] @@ -39,7 +39,7 @@ define i32 @foo(i32 %v1, double %v2) { ; CHECK: if.end.1: ; CHECK-NEXT: br label [[FOR_COND15_1:%.*]] ; CHECK: for.cond15.1: -; CHECK-NEXT: br i1 undef, label [[FOR_END39:%.*]], label [[FOR_COND15_PREHEADER]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END39:%.*]], label [[FOR_COND15_PREHEADER]] ; entry: %conv = sitofp i32 undef to double @@ -56,7 +56,7 @@ if.end: ; preds = %for.cond15.preheade br label %for.cond15 for.end39: ; preds = %for.cond15.1 - switch i32 undef, label %do.body [ + switch i32 %arg2, label %do.body [ i32 0, label %sw.bb i32 1, label %sw.bb195 ] @@ -99,7 +99,7 @@ if.end.1: ; preds = %for.cond15 br label %for.cond15.1 for.cond15.1: ; preds = %if.end.1 - br i1 undef, label %for.end39, label %for.cond15.preheader + br i1 %arg, label %for.end39, label %for.cond15.preheader } declare double @llvm.fmuladd.f64(double, double, double) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll index 9910090d43eae..82761b458efcf 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll @@ -3,13 +3,13 @@ target triple = "aarch64-unknown-linux-gnu" @d = internal unnamed_addr global i32 5, align 4 -define dso_local void @l() local_unnamed_addr { +define dso_local void @l(i1 %arg) local_unnamed_addr { ; CHECK-LABEL: @l( ; CHECK-NEXT: bb: ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[BB:%.*]] ], [ [[TMP9:%.*]], [[BB25:%.*]] ] -; CHECK-NEXT: br i1 undef, label [[BB3:%.*]], label [[BB11:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB3:%.*]], label [[BB11:%.*]] ; CHECK: bb3: ; CHECK-NEXT: [[I4:%.*]] = zext i1 undef to i32 ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], undef @@ -34,7 +34,7 @@ define 
dso_local void @l() local_unnamed_addr { ; CHECK-NEXT: [[TMP13:%.*]] = zext i1 [[TMP12]] to i32 ; CHECK-NEXT: [[I32:%.*]] = and i32 [[I31]], [[TMP13]] ; CHECK-NEXT: [[I33:%.*]] = and i32 [[I32]], [[I28]] -; CHECK-NEXT: br i1 undef, label [[BB34:%.*]], label [[BB1]] +; CHECK-NEXT: br i1 %arg, label [[BB34:%.*]], label [[BB1]] ; CHECK: bb34: ; CHECK-NEXT: [[I35:%.*]] = phi i32 [ [[I33]], [[BB25]] ] ; CHECK-NEXT: br label [[BB36:%.*]] @@ -48,7 +48,7 @@ bb: bb1: ; preds = %bb25, %bb %i = phi i16 [ undef, %bb ], [ %i29, %bb25 ] %i2 = phi i16 [ undef, %bb ], [ %i30, %bb25 ] - br i1 undef, label %bb3, label %bb11 + br i1 %arg, label %bb3, label %bb11 bb3: ; preds = %bb1 %i4 = zext i1 undef to i32 @@ -85,7 +85,7 @@ bb25: ; preds = %bb11, %bb3 %i31 = and i32 undef, %i26 %i32 = and i32 %i31, %i27 %i33 = and i32 %i32, %i28 - br i1 undef, label %bb34, label %bb1 + br i1 %arg, label %bb34, label %bb1 bb34: ; preds = %bb25 %i35 = phi i32 [ %i33, %bb25 ] diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll new file mode 100644 index 0000000000000..fa0587f1da931 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll @@ -0,0 +1,188 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \ +; RUN: | FileCheck %s +; +; Test vectorization and reassociation of fadd operations. If the loads can +; be vectorized, cases of fewer operands are also profitable to vectorize. + +define double @fadd_double_4_addends_seq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fadd_double_4_addends_seq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP0]]) +; CHECK-NEXT: ret double [[TMP1]] +; +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + ret double %add5 +} + +define double @fadd_double_8_addends_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fadd_double_8_addends_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6 +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = load 
double, ptr [[ARRAYIDX6]], align 8 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10 +; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14 +; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP15]]) +; CHECK-NEXT: ret double [[TMP16]] +; +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8 + %4 = load double, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn double %add5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10 + %5 = load double, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn double %add7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12 + %6 = load double, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn double %add9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14 + %7 = load double, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn double %add11, %7 + ret double %add13 +} + +define float @fadd_float_16_addends_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define float @fadd_float_16_addends_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], 
align 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4 +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15 +; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP31]]) +; CHECK-NEXT: ret float [[TMP32]] +; +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd reassoc nsz arcp contract afn float 
%1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8 + %4 = load float, ptr %arrayidx6, align 4 + %add7 = fadd reassoc nsz arcp contract afn float %add5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10 + %5 = load float, ptr %arrayidx8, align 4 + %add9 = fadd reassoc nsz arcp contract afn float %add7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12 + %6 = load float, ptr %arrayidx10, align 4 + %add11 = fadd reassoc nsz arcp contract afn float %add9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14 + %7 = load float, ptr %arrayidx12, align 4 + %add13 = fadd reassoc nsz arcp contract afn float %add11, %7 + %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16 + %8 = load float, ptr %arrayidx14, align 4 + %add15 = fadd reassoc nsz arcp contract afn float %add13, %8 + %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18 + %9 = load float, ptr %arrayidx16, align 4 + %add17 = fadd reassoc nsz arcp contract afn float %add15, %9 + %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20 + %10 = load float, ptr %arrayidx18, align 4 + %add19 = fadd reassoc nsz arcp contract afn float %add17, %10 + %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22 + %11 = load float, ptr %arrayidx20, align 4 + %add21 = fadd reassoc nsz arcp contract afn float %add19, %11 + %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24 + %12 = load float, ptr %arrayidx22, align 4 + %add23 = fadd reassoc nsz arcp contract afn float %add21, %12 + %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26 + %13 = load float, ptr %arrayidx24, align 4 + %add25 = fadd reassoc nsz arcp contract afn float %add23, %13 + %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28 + %14 = load float, ptr %arrayidx26, align 4 + %add27 = fadd reassoc nsz arcp contract afn float %add25, %14 + %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30 + %15 = load float, ptr %arrayidx28, align 4 + %add29 = fadd reassoc nsz arcp contract afn float %add27, %15 + ret float %add29 +} diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll new file mode 100644 index 0000000000000..5ea777e1c9a10 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll @@ -0,0 +1,411 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \ +; RUN: | FileCheck %s + +; Test vectorization and reassociation of fmin/fmax operations. Vectorization +; is more profitable if the loads are also vectorizable. 
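; Illustrative summary (an editor's sketch under the assumption that the
; <4 x double> load below is legal for the target; not part of the new
; file): a chain of `fast` llvm.minnum calls, each feeding the next, can be
; reassociated into a single vector reduction once the operands are loaded
; or gathered as one vector, which is what the tests below check:
;
;   %m1 = call fast double @llvm.minnum.f64(double %t1, double %t0)
;   %m2 = call fast double @llvm.minnum.f64(double %t2, double %m1)
;   %m3 = call fast double @llvm.minnum.f64(double %t3, double %m2)
; becomes, for four sequential loads:
;   %v = load <4 x double>, ptr %x
;   %r = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %v)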
+ +define double @fmin_double_4_nums_seq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmin_double_4_nums_seq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %g1 = getelementptr inbounds double, ptr %x, i64 1 + %g2 = getelementptr inbounds double, ptr %x, i64 2 + %g3 = getelementptr inbounds double, ptr %x, i64 3 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1) + %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2) + ret double %m3 +} + +define double @fmin_double_16_nums_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmin_double_16_nums_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4 +; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6 +; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8 +; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10 +; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12 +; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14 +; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16 +; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18 +; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20 +; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22 +; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24 +; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26 +; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28 +; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30 +; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4 +; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4 +; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4 +; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4 +; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4 +; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4 +; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4 +; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> 
[[TMP2]], double [[T2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15 +; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> [[TMP16]]) +; CHECK-NEXT: ret double [[TMP17]] +; + %g1 = getelementptr inbounds double, ptr %x, i64 2 + %g2 = getelementptr inbounds double, ptr %x, i64 4 + %g3 = getelementptr inbounds double, ptr %x, i64 6 + %g4 = getelementptr inbounds double, ptr %x, i64 8 + %g5 = getelementptr inbounds double, ptr %x, i64 10 + %g6 = getelementptr inbounds double, ptr %x, i64 12 + %g7 = getelementptr inbounds double, ptr %x, i64 14 + %g8 = getelementptr inbounds double, ptr %x, i64 16 + %g9 = getelementptr inbounds double, ptr %x, i64 18 + %g10 = getelementptr inbounds double, ptr %x, i64 20 + %g11 = getelementptr inbounds double, ptr %x, i64 22 + %g12 = getelementptr inbounds double, ptr %x, i64 24 + %g13 = getelementptr inbounds double, ptr %x, i64 26 + %g14 = getelementptr inbounds double, ptr %x, i64 28 + %g15 = getelementptr inbounds double, ptr %x, i64 30 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %t4 = load double, ptr %g4, align 4 + %t5 = load double, ptr %g5, align 4 + %t6 = load double, ptr %g6, align 4 + %t7 = load double, ptr %g7, align 4 + %t8 = load double, ptr %g8, align 4 + %t9 = load double, ptr %g9, align 4 + %t10 = load double, ptr %g10, align 4 + %t11 = load double, ptr %g11, align 4 + %t12 = load double, ptr %g12, align 4 + %t13 = load double, ptr %g13, align 4 + %t14 = load double, ptr %g14, align 4 + %t15 = load double, ptr %g15, align 4 + %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1) + %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2) + %m4 = tail call fast double @llvm.minnum.f64(double %t4, double %m3) + %m5 = tail call fast double @llvm.minnum.f64(double %t5, double %m4) + %m6 = tail call fast double @llvm.minnum.f64(double %t6, double %m5) + %m7 = tail call fast double @llvm.minnum.f64(double %t7, double %m6) + %m8 = tail call fast double @llvm.minnum.f64(double %t8, double %m7) + %m9 = tail call fast double @llvm.minnum.f64(double %t9, double %m8) + %m10 = tail call fast double @llvm.minnum.f64(double %t10, double %m9) + %m11 = tail call fast double @llvm.minnum.f64(double 
%t11, double %m10) + %m12 = tail call fast double @llvm.minnum.f64(double %t12, double %m11) + %m13 = tail call fast double @llvm.minnum.f64(double %t13, double %m12) + %m14 = tail call fast double @llvm.minnum.f64(double %t14, double %m13) + %m15 = tail call fast double @llvm.minnum.f64(double %t15, double %m14) + ret double %m15 +} + +define float @fmin_float_12_nums_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define float @fmin_float_12_nums_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4 +; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6 +; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8 +; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10 +; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12 +; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14 +; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16 +; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18 +; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20 +; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22 +; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4 +; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4 +; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4 +; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x float> poison, float [[T1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <12 x float> [[TMP1]], float [[T0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <12 x float> [[TMP2]], float [[T2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <12 x float> [[TMP3]], float [[T3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <12 x float> [[TMP4]], float [[T4]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <12 x float> [[TMP5]], float [[T5]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <12 x float> [[TMP6]], float [[T6]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <12 x float> [[TMP7]], float [[T7]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <12 x float> [[TMP8]], float [[T8]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <12 x float> [[TMP9]], float [[T9]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <12 x float> [[TMP10]], float [[T10]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <12 x float> [[TMP11]], float [[T11]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmin.v12f32(<12 x float> [[TMP12]]) +; CHECK-NEXT: ret float [[TMP13]] +; + %g1 = getelementptr inbounds float, ptr %x, i64 2 + %g2 = getelementptr inbounds float, ptr %x, i64 4 + %g3 = getelementptr inbounds float, ptr %x, i64 6 + %g4 = getelementptr inbounds float, ptr %x, i64 8 + %g5 = getelementptr inbounds float, ptr %x, i64 10 + 
%g6 = getelementptr inbounds float, ptr %x, i64 12 + %g7 = getelementptr inbounds float, ptr %x, i64 14 + %g8 = getelementptr inbounds float, ptr %x, i64 16 + %g9 = getelementptr inbounds float, ptr %x, i64 18 + %g10 = getelementptr inbounds float, ptr %x, i64 20 + %g11 = getelementptr inbounds float, ptr %x, i64 22 + %t0 = load float, ptr %x, align 4 + %t1 = load float, ptr %g1, align 4 + %t2 = load float, ptr %g2, align 4 + %t3 = load float, ptr %g3, align 4 + %t4 = load float, ptr %g4, align 4 + %t5 = load float, ptr %g5, align 4 + %t6 = load float, ptr %g6, align 4 + %t7 = load float, ptr %g7, align 4 + %t8 = load float, ptr %g8, align 4 + %t9 = load float, ptr %g9, align 4 + %t10 = load float, ptr %g10, align 4 + %t11 = load float, ptr %g11, align 4 + %m1 = tail call fast float @llvm.minnum.f32(float %t1, float %t0) + %m2 = tail call fast float @llvm.minnum.f32(float %t2, float %m1) + %m3 = tail call fast float @llvm.minnum.f32(float %t3, float %m2) + %m4 = tail call fast float @llvm.minnum.f32(float %t4, float %m3) + %m5 = tail call fast float @llvm.minnum.f32(float %t5, float %m4) + %m6 = tail call fast float @llvm.minnum.f32(float %t6, float %m5) + %m7 = tail call fast float @llvm.minnum.f32(float %t7, float %m6) + %m8 = tail call fast float @llvm.minnum.f32(float %t8, float %m7) + %m9 = tail call fast float @llvm.minnum.f32(float %t9, float %m8) + %m10 = tail call fast float @llvm.minnum.f32(float %t10, float %m9) + %m11 = tail call fast float @llvm.minnum.f32(float %t11, float %m10) + ret float %m11 +} + +define double @fmax_double_4_nums_seq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmax_double_4_nums_seq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %g1 = getelementptr inbounds double, ptr %x, i64 1 + %g2 = getelementptr inbounds double, ptr %x, i64 2 + %g3 = getelementptr inbounds double, ptr %x, i64 3 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1) + %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2) + ret double %m3 +} + +define double @fmax_double_16_nums_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmax_double_16_nums_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4 +; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6 +; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8 +; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10 +; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12 +; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14 +; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16 +; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18 +; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20 +; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22 +; 
CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24 +; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26 +; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28 +; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30 +; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4 +; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4 +; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4 +; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4 +; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4 +; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4 +; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4 +; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15 +; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> [[TMP16]]) +; CHECK-NEXT: ret double [[TMP17]] +; + %g1 = getelementptr inbounds double, ptr %x, i64 2 + %g2 = getelementptr inbounds double, ptr %x, i64 4 + %g3 = getelementptr inbounds double, ptr %x, i64 6 + %g4 = getelementptr inbounds double, ptr %x, i64 8 + %g5 = getelementptr inbounds double, ptr %x, i64 10 + %g6 = getelementptr inbounds double, ptr %x, i64 12 + %g7 = getelementptr inbounds double, ptr %x, i64 14 + %g8 = getelementptr inbounds double, ptr %x, i64 16 + %g9 = getelementptr inbounds double, ptr %x, i64 18 + %g10 = getelementptr inbounds double, ptr %x, i64 20 + %g11 = getelementptr inbounds double, ptr %x, i64 22 + %g12 = getelementptr inbounds double, ptr %x, i64 24 + %g13 = 
getelementptr inbounds double, ptr %x, i64 26 + %g14 = getelementptr inbounds double, ptr %x, i64 28 + %g15 = getelementptr inbounds double, ptr %x, i64 30 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %t4 = load double, ptr %g4, align 4 + %t5 = load double, ptr %g5, align 4 + %t6 = load double, ptr %g6, align 4 + %t7 = load double, ptr %g7, align 4 + %t8 = load double, ptr %g8, align 4 + %t9 = load double, ptr %g9, align 4 + %t10 = load double, ptr %g10, align 4 + %t11 = load double, ptr %g11, align 4 + %t12 = load double, ptr %g12, align 4 + %t13 = load double, ptr %g13, align 4 + %t14 = load double, ptr %g14, align 4 + %t15 = load double, ptr %g15, align 4 + %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1) + %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2) + %m4 = tail call fast double @llvm.maxnum.f64(double %t4, double %m3) + %m5 = tail call fast double @llvm.maxnum.f64(double %t5, double %m4) + %m6 = tail call fast double @llvm.maxnum.f64(double %t6, double %m5) + %m7 = tail call fast double @llvm.maxnum.f64(double %t7, double %m6) + %m8 = tail call fast double @llvm.maxnum.f64(double %t8, double %m7) + %m9 = tail call fast double @llvm.maxnum.f64(double %t9, double %m8) + %m10 = tail call fast double @llvm.maxnum.f64(double %t10, double %m9) + %m11 = tail call fast double @llvm.maxnum.f64(double %t11, double %m10) + %m12 = tail call fast double @llvm.maxnum.f64(double %t12, double %m11) + %m13 = tail call fast double @llvm.maxnum.f64(double %t13, double %m12) + %m14 = tail call fast double @llvm.maxnum.f64(double %t14, double %m13) + %m15 = tail call fast double @llvm.maxnum.f64(double %t15, double %m14) + ret double %m15 +} + +define float @fmax_float_12_nums_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define float @fmax_float_12_nums_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4 +; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6 +; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8 +; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10 +; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12 +; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14 +; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16 +; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18 +; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20 +; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22 +; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4 +; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4 +; CHECK-NEXT: [[T10:%.*]] = load 
float, ptr [[G10]], align 4 +; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x float> poison, float [[T1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <12 x float> [[TMP1]], float [[T0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <12 x float> [[TMP2]], float [[T2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <12 x float> [[TMP3]], float [[T3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <12 x float> [[TMP4]], float [[T4]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <12 x float> [[TMP5]], float [[T5]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <12 x float> [[TMP6]], float [[T6]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <12 x float> [[TMP7]], float [[T7]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <12 x float> [[TMP8]], float [[T8]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <12 x float> [[TMP9]], float [[T9]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <12 x float> [[TMP10]], float [[T10]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <12 x float> [[TMP11]], float [[T11]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmax.v12f32(<12 x float> [[TMP12]]) +; CHECK-NEXT: ret float [[TMP13]] +; + %g1 = getelementptr inbounds float, ptr %x, i64 2 + %g2 = getelementptr inbounds float, ptr %x, i64 4 + %g3 = getelementptr inbounds float, ptr %x, i64 6 + %g4 = getelementptr inbounds float, ptr %x, i64 8 + %g5 = getelementptr inbounds float, ptr %x, i64 10 + %g6 = getelementptr inbounds float, ptr %x, i64 12 + %g7 = getelementptr inbounds float, ptr %x, i64 14 + %g8 = getelementptr inbounds float, ptr %x, i64 16 + %g9 = getelementptr inbounds float, ptr %x, i64 18 + %g10 = getelementptr inbounds float, ptr %x, i64 20 + %g11 = getelementptr inbounds float, ptr %x, i64 22 + %t0 = load float, ptr %x, align 4 + %t1 = load float, ptr %g1, align 4 + %t2 = load float, ptr %g2, align 4 + %t3 = load float, ptr %g3, align 4 + %t4 = load float, ptr %g4, align 4 + %t5 = load float, ptr %g5, align 4 + %t6 = load float, ptr %g6, align 4 + %t7 = load float, ptr %g7, align 4 + %t8 = load float, ptr %g8, align 4 + %t9 = load float, ptr %g9, align 4 + %t10 = load float, ptr %g10, align 4 + %t11 = load float, ptr %g11, align 4 + %m1 = tail call fast float @llvm.maxnum.f32(float %t1, float %t0) + %m2 = tail call fast float @llvm.maxnum.f32(float %t2, float %m1) + %m3 = tail call fast float @llvm.maxnum.f32(float %t3, float %m2) + %m4 = tail call fast float @llvm.maxnum.f32(float %t4, float %m3) + %m5 = tail call fast float @llvm.maxnum.f32(float %t5, float %m4) + %m6 = tail call fast float @llvm.maxnum.f32(float %t6, float %m5) + %m7 = tail call fast float @llvm.maxnum.f32(float %t7, float %m6) + %m8 = tail call fast float @llvm.maxnum.f32(float %t8, float %m7) + %m9 = tail call fast float @llvm.maxnum.f32(float %t9, float %m8) + %m10 = tail call fast float @llvm.maxnum.f32(float %t10, float %m9) + %m11 = tail call fast float @llvm.maxnum.f32(float %t11, float %m10) + ret float %m11 +} + +declare float @llvm.minnum.f32(float, float) +declare double @llvm.minnum.f64(double, double) +declare float @llvm.maxnum.f32(float, float) +declare double @llvm.maxnum.f64(double, double) diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll new file mode 100644 index 0000000000000..e08b38c69a840 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll @@ 
-0,0 +1,188 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \ +; RUN: | FileCheck %s + +; Test vectorization and reassociation of fmul operations. If the loads can +; be vectorized, cases of fewer operands are also profitable to vectorize. + +define double @fmul_double_4_factors_seq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmul_double_4_factors_seq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[TMP0]]) +; CHECK-NEXT: ret double [[TMP1]] +; +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + ret double %mul5 +} + +define double @fmul_double_8_factors_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmul_double_8_factors_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6 +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10 +; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14 +; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7 +; 
CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> [[TMP15]]) +; CHECK-NEXT: ret double [[TMP16]] +; +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8 + %4 = load double, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10 + %5 = load double, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12 + %6 = load double, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14 + %7 = load double, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7 + ret double %mul13 +} + +define float @fmul_float_16_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define float @fmul_float_16_factors_nonseq( +; CHECK-SAME: float noundef [[M:%.*]], ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24 +; CHECK-NEXT: 
[[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4 +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15 +; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP31]]) +; CHECK-NEXT: ret float [[TMP32]] +; +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8 + %4 = load float, ptr %arrayidx6, align 4 + %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10 + %5 = load float, ptr %arrayidx8, align 4 + %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12 + %6 = load float, ptr %arrayidx10, align 4 + %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14 + %7 = load float, ptr %arrayidx12, align 4 + %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7 + %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16 + %8 = load float, ptr %arrayidx14, align 4 + %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8 + %arrayidx16 = 
getelementptr inbounds float, ptr %x, i64 18 + %9 = load float, ptr %arrayidx16, align 4 + %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9 + %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20 + %10 = load float, ptr %arrayidx18, align 4 + %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10 + %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22 + %11 = load float, ptr %arrayidx20, align 4 + %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11 + %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24 + %12 = load float, ptr %arrayidx22, align 4 + %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12 + %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26 + %13 = load float, ptr %arrayidx24, align 4 + %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13 + %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28 + %14 = load float, ptr %arrayidx26, align 4 + %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14 + %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30 + %15 = load float, ptr %arrayidx28, align 4 + %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15 + ret float %mul29 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll index 607d7f7888784..c029781142af3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll @@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.8.0" %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334 = type { %struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333, ptr, ptr, ptr, i32, i32, i64, i64, i32, i32, i32, [4 x i32], i32, i32, i32, i32, i32, [20 x i8] } %struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333 = type { i32, i32, i32, i32 } -define fastcc void @LzmaDec_DecodeReal2(ptr %p) { +define fastcc void @LzmaDec_DecodeReal2(ptr %p, i1 %arg) { ; CHECK-LABEL: @LzmaDec_DecodeReal2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RANGE20_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334:%.*]], ptr [[P:%.*]], i64 0, i32 4 @@ -15,13 +15,13 @@ define fastcc void @LzmaDec_DecodeReal2(ptr %p) { ; CHECK: do.body66.i: ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP0]] -; CHECK-NEXT: br i1 undef, label [[DO_COND_I]], label [[IF_ELSE_I:%.*]] +; CHECK-NEXT: br i1 %arg, label [[DO_COND_I]], label [[IF_ELSE_I:%.*]] ; CHECK: if.else.i: ; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[TMP1]], undef ; CHECK-NEXT: br label [[DO_COND_I]] ; CHECK: do.cond.i: ; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ [[TMP2]], [[IF_ELSE_I]] ], [ [[TMP1]], [[DO_BODY66_I]] ] -; CHECK-NEXT: br i1 undef, label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]] +; CHECK-NEXT: br i1 %arg, label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]] ; CHECK: do.end1006.i: ; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP3]] ; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr [[RANGE20_I]], align 4 @@ -37,7 +37,7 @@ do.body66.i: ; preds = %do.cond.i, %entry %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ undef, %entry ] %.range.2.i = select i1 undef, i32 undef, i32 %range.2.i %.code.2.i = select i1 undef, i32 undef, i32 %code.2.i - br i1 undef, label %do.cond.i, label %if.else.i + br i1 %arg, label %do.cond.i, label %if.else.i if.else.i: ; preds = 
%do.body66.i %sub91.i = sub i32 %.range.2.i, undef @@ -47,7 +47,7 @@ if.else.i: ; preds = %do.body66.i do.cond.i: ; preds = %if.else.i, %do.body66.i %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ undef, %do.body66.i ] %code.4.i = phi i32 [ %sub92.i, %if.else.i ], [ %.code.2.i, %do.body66.i ] - br i1 undef, label %do.body66.i, label %do.end1006.i + br i1 %arg, label %do.body66.i, label %do.end1006.i do.end1006.i: ; preds = %do.cond.i %.range.4.i = select i1 undef, i32 undef, i32 %range.4.i diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll index f1f83c0663099..291edbbc925bd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll @@ -6,17 +6,17 @@ target triple = "x86_64-apple-macosx10.8.0" %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960" = type { i32, i32 } -define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btConstraintInfo1E(ptr nocapture %info) { +define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btConstraintInfo1E(ptr nocapture %info, i1 %arg) { ; CHECK-LABEL: @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btConstraintInfo1E( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: ret void ; CHECK: if.else: ; CHECK-NEXT: [[NUB5:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", ptr [[INFO:%.*]], i64 0, i32 1 -; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE_I_1:%.*]], label [[IF_THEN7_1:%.*]] +; CHECK-NEXT: br i1 %arg, label [[LAND_LHS_TRUE_I_1:%.*]], label [[IF_THEN7_1:%.*]] ; CHECK: land.lhs.true.i.1: -; CHECK-NEXT: br i1 undef, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]] +; CHECK-NEXT: br i1 %arg, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]] ; CHECK: if.then7.1: ; CHECK-NEXT: store <2 x i32> <i32 1, i32 5>, ptr [[INFO]], align 4 ; CHECK-NEXT: br label [[FOR_INC_1]] @@ -30,17 +30,17 @@ define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btCons ; CHECK-NEXT: unreachable ; entry: - br i1 undef, label %if.else, label %if.then + br i1 %arg, label %if.else, label %if.then if.then: ; preds = %entry ret void if.else: ; preds = %entry %nub5 = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", ptr %info, i64 0, i32 1 - br i1 undef, label %land.lhs.true.i.1, label %if.then7.1 + br i1 %arg, label %land.lhs.true.i.1, label %if.then7.1 land.lhs.true.i.1: ; preds = %if.else - br i1 undef, label %for.inc.1, label %if.then7.1 + br i1 %arg, label %for.inc.1, label %if.then7.1 if.then7.1: ; preds = %land.lhs.true.i.1, %if.else %inc.1 = add nsw i32 0, 1 @@ -63,7 +63,7 @@ for.inc.1: ; preds = %if.then7.1, %land.l %class.btVector3.5.30.65.90.115.140.175.185.260.280.330 = type { [4 x float] } %class.btVector4.7.32.67.92.117.142.177.187.262.282.331 = type { %class.btVector3.5.30.65.90.115.140.175.185.260.280.330 } -define void @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector3S2_S2_fS2_S2_S2_fR25GIM_TRIANGLE_CONTACT_DATA(ptr %this) { +define void @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector3S2_S2_fS2_S2_S2_fR25GIM_TRIANGLE_CONTACT_DATA(ptr %this, i1 %arg) { ; CHECK-LABEL: @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector3S2_S2_fS2_S2_S2_fR25GIM_TRIANGLE_CONTACT_DATA( ; 
CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [[CLASS_GIM_TRIANGLE_CALCULATION_CACHE_9_34_69_94_119_144_179_189_264_284_332:%.*]], ptr [[THIS:%.*]], i64 0, i32 2, i64 0, i32 0, i64 1 @@ -76,9 +76,9 @@ define void @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector ; CHECK-NEXT: store float [[TMP4]], ptr undef, align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x float> [[TMP2]], [[TMP3]] ; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[ARRAYIDX26]], align 4 -; CHECK-NEXT: br i1 undef, label [[IF_ELSE1609:%.*]], label [[IF_THEN1595:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_ELSE1609:%.*]], label [[IF_THEN1595:%.*]] ; CHECK: if.then1595: -; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[FOR_BODY_LR_PH_I_I1702:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN:%.*]], label [[FOR_BODY_LR_PH_I_I1702:%.*]] ; CHECK: for.body.lr.ph.i.i1702: ; CHECK-NEXT: unreachable ; CHECK: if.else1609: @@ -99,10 +99,10 @@ entry: %sub639 = fsub float %add626, undef %sub652 = fsub float %add626, %sub639 store float %sub652, ptr %arrayidx36, align 4 - br i1 undef, label %if.else1609, label %if.then1595 + br i1 %arg, label %if.else1609, label %if.then1595 if.then1595: ; preds = %entry - br i1 undef, label %return, label %for.body.lr.ph.i.i1702 + br i1 %arg, label %return, label %for.body.lr.ph.i.i1702 for.body.lr.ph.i.i1702: ; preds = %if.then1595 unreachable @@ -114,34 +114,34 @@ return: ; preds = %if.then1595 ret void } -define void @_Z8dBoxBox2RK9btVector3PKfS1_S1_S3_S1_RS_PfPiiP12dContactGeomiRN36btDiscreteCollisionDetectorInterface6ResultE() { +define void @_Z8dBoxBox2RK9btVector3PKfS1_S1_S3_S1_RS_PfPiiP12dContactGeomiRN36btDiscreteCollisionDetectorInterface6ResultE(i1 %arg) { ; CHECK-LABEL: @_Z8dBoxBox2RK9btVector3PKfS1_S1_S3_S1_RS_PfPiiP12dContactGeomiRN36btDiscreteCollisionDetectorInterface6ResultE( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: -; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END111:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN]], label [[IF_END111:%.*]] ; CHECK: if.end111: -; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END136:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN]], label [[IF_END136:%.*]] ; CHECK: if.end136: -; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END162:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN]], label [[IF_END162:%.*]] ; CHECK: if.end162: -; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END189:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN]], label [[IF_END189:%.*]] ; CHECK: if.end189: -; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END216:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN]], label [[IF_END216:%.*]] ; CHECK: if.end216: -; CHECK-NEXT: br i1 undef, label [[IF_THEN218:%.*]], label [[IF_END225:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN218:%.*]], label [[IF_END225:%.*]] ; CHECK: if.then218: ; CHECK-NEXT: br label [[IF_END225]] ; CHECK: if.end225: -; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END248:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN]], label [[IF_END248:%.*]] ; CHECK: if.end248: -; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END304:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN]], label [[IF_END304:%.*]] ; CHECK: if.end304: -; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END361:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN]], label [[IF_END361:%.*]] ; CHECK: if.end361: -; 
CHECK-NEXT: br i1 undef, label [[IF_THEN370:%.*]], label [[IF_END395:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN370:%.*]], label [[IF_END395:%.*]] ; CHECK: if.then370: -; CHECK-NEXT: br i1 undef, label [[IF_THEN374:%.*]], label [[IF_END395]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN374:%.*]], label [[IF_END395]] ; CHECK: if.then374: ; CHECK-NEXT: br label [[IF_END395]] ; CHECK: if.end395: @@ -152,47 +152,47 @@ define void @_Z8dBoxBox2RK9btVector3PKfS1_S1_S3_S1_RS_PfPiiP12dContactGeomiRN36b entry: %add8.i2343 = fadd float undef, undef %add8.i2381 = fadd float undef, undef - br i1 undef, label %return, label %if.end + br i1 %arg, label %return, label %if.end if.end: ; preds = %entry - br i1 undef, label %return, label %if.end111 + br i1 %arg, label %return, label %if.end111 if.end111: ; preds = %if.end - br i1 undef, label %return, label %if.end136 + br i1 %arg, label %return, label %if.end136 if.end136: ; preds = %if.end111 - br i1 undef, label %return, label %if.end162 + br i1 %arg, label %return, label %if.end162 if.end162: ; preds = %if.end136 - br i1 undef, label %return, label %if.end189 + br i1 %arg, label %return, label %if.end189 if.end189: ; preds = %if.end162 - br i1 undef, label %return, label %if.end216 + br i1 %arg, label %return, label %if.end216 if.end216: ; preds = %if.end189 - br i1 undef, label %if.then218, label %if.end225 + br i1 %arg, label %if.then218, label %if.end225 if.then218: ; preds = %if.end216 br label %if.end225 if.end225: ; preds = %if.then218, %if.end216 - br i1 undef, label %return, label %if.end248 + br i1 %arg, label %return, label %if.end248 if.end248: ; preds = %if.end225 - br i1 undef, label %return, label %if.end304 + br i1 %arg, label %return, label %if.end304 if.end304: ; preds = %if.end248 %mul341 = fmul float undef, %add8.i2343 %mul344 = fmul float undef, %add8.i2381 %sub345 = fsub float %mul341, %mul344 - br i1 undef, label %return, label %if.end361 + br i1 %arg, label %return, label %if.end361 if.end361: ; preds = %if.end304 %mul364 = fmul float %add8.i2381, %add8.i2381 - br i1 undef, label %if.then370, label %if.end395 + br i1 %arg, label %if.then370, label %if.end395 if.then370: ; preds = %if.end361 - br i1 undef, label %if.then374, label %if.end395 + br i1 %arg, label %if.then374, label %if.end395 if.then374: ; preds = %if.then370 %cmp392 = fcmp olt float %sub345, 0.000000e+00 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll index 925b348cdeec1..55e691b39d78c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll @@ -7,36 +7,36 @@ target triple = "x86_64-apple-macosx10.8.0" %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113 = type { [4 x float] } ; Function Attrs: ssp uwtable -define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(ptr %vertices) #0 align 2 { +define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(ptr %vertices, i1 %arg) #0 align 2 { ; CHECK-LABEL: @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN:%.*]], label [[IF_END:%.*]] ; CHECK: if.end: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: br i1 undef, label [[IF_THEN17_1:%.*]], label [[IF_END22_1:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN17_1:%.*]], 
label [[IF_END22_1:%.*]] ; CHECK: for.end36: ; CHECK-NEXT: br label [[FOR_BODY144:%.*]] ; CHECK: for.body144: -; CHECK-NEXT: br i1 undef, label [[FOR_END227:%.*]], label [[FOR_BODY144]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END227:%.*]], label [[FOR_BODY144]] ; CHECK: for.end227: -; CHECK-NEXT: br i1 undef, label [[FOR_END271:%.*]], label [[FOR_BODY233:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END271:%.*]], label [[FOR_BODY233:%.*]] ; CHECK: for.body233: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY233]], label [[FOR_END271]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY233]], label [[FOR_END271]] ; CHECK: for.end271: ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ splat (float 0x47EFFFFFE0000000), [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> undef, [[TMP0]] -; CHECK-NEXT: br i1 undef, label [[IF_THEN291:%.*]], label [[RETURN]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN291:%.*]], label [[RETURN]] ; CHECK: if.then291: ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], splat (float 5.000000e-01) ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP0]], [[TMP2]] -; CHECK-NEXT: br i1 undef, label [[IF_END332:%.*]], label [[IF_ELSE319:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_END332:%.*]], label [[IF_ELSE319:%.*]] ; CHECK: if.else319: -; CHECK-NEXT: br i1 undef, label [[IF_THEN325:%.*]], label [[IF_END327:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN325:%.*]], label [[IF_END327:%.*]] ; CHECK: if.then325: ; CHECK-NEXT: br label [[IF_END327]] ; CHECK: if.end327: -; CHECK-NEXT: br i1 undef, label [[IF_THEN329:%.*]], label [[IF_END332]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN329:%.*]], label [[IF_END332]] ; CHECK: if.then329: ; CHECK-NEXT: br label [[IF_END332]] ; CHECK: if.end332: @@ -49,55 +49,55 @@ define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(ptr %ve ; CHECK: if.then17.1: ; CHECK-NEXT: br label [[IF_END22_1]] ; CHECK: if.end22.1: -; CHECK-NEXT: br i1 undef, label [[IF_THEN17_2:%.*]], label [[IF_END22_2:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN17_2:%.*]], label [[IF_END22_2:%.*]] ; CHECK: if.then17.2: ; CHECK-NEXT: br label [[IF_END22_2]] ; CHECK: if.end22.2: -; CHECK-NEXT: br i1 undef, label [[FOR_END36:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END36:%.*]], label [[FOR_BODY]] ; entry: - br i1 undef, label %return, label %if.end + br i1 %arg, label %return, label %if.end if.end: ; preds = %entry br label %for.body for.body: ; preds = %if.end22.2, %if.end - br i1 undef, label %if.then17.1, label %if.end22.1 + br i1 %arg, label %if.then17.1, label %if.end22.1 for.end36: ; preds = %if.end22.2 br label %for.body144 for.body144: ; preds = %for.body144, %for.end36 - br i1 undef, label %for.end227, label %for.body144 + br i1 %arg, label %for.end227, label %for.body144 for.end227: ; preds = %for.body144 - br i1 undef, label %for.end271, label %for.body233 + br i1 %arg, label %for.end271, label %for.body233 for.body233: ; preds = %for.body233, %for.end227 - br i1 undef, label %for.body233, label %for.end271 + br i1 %arg, label %for.body233, label %for.end271 for.end271: ; preds = %for.body233, %for.end227 %0 = phi float [ 0x47EFFFFFE0000000, %for.end227 ], [ undef, %for.body233 ] %1 = phi float [ 0x47EFFFFFE0000000, %for.end227 ], [ undef, %for.body233 ] %sub275 = fsub float undef, %1 %sub279 = fsub float undef, %0 - br i1 undef, label %if.then291, label %return + br i1 %arg, label %if.then291, label %return if.then291: ; preds = %for.end271 %mul292 = fmul float %sub275, 5.000000e-01 %add294 
= fadd float %1, %mul292 %mul295 = fmul float %sub279, 5.000000e-01 %add297 = fadd float %0, %mul295 - br i1 undef, label %if.end332, label %if.else319 + br i1 %arg, label %if.end332, label %if.else319 if.else319: ; preds = %if.then291 - br i1 undef, label %if.then325, label %if.end327 + br i1 %arg, label %if.then325, label %if.end327 if.then325: ; preds = %if.else319 br label %if.end327 if.end327: ; preds = %if.then325, %if.else319 - br i1 undef, label %if.then329, label %if.end332 + br i1 %arg, label %if.then329, label %if.end332 if.then329: ; preds = %if.end327 br label %if.end332 @@ -119,13 +119,13 @@ if.then17.1: ; preds = %for.body br label %if.end22.1 if.end22.1: ; preds = %if.then17.1, %for.body - br i1 undef, label %if.then17.2, label %if.end22.2 + br i1 %arg, label %if.then17.2, label %if.end22.2 if.then17.2: ; preds = %if.end22.1 br label %if.end22.2 if.end22.2: ; preds = %if.then17.2, %if.end22.1 - br i1 undef, label %for.end36, label %for.body + br i1 %arg, label %for.end36, label %for.body } attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll index 4de16a5d57793..faf4496ce2722 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -6,23 +6,23 @@ target triple = "x86_64-apple-macosx10.8.0" %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731" = type { ptr, ptr, ptr, ptr } ; Function Attrs: nounwind ssp uwtable -define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(ptr %__first, ptr nocapture %__last) { +define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(ptr %__first, ptr nocapture %__last, i1 %arg) { ; CHECK-LABEL: @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__FIRST:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x ptr>, ptr [[__LAST:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[TMP0]], i32 0 -; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]] +; CHECK-NEXT: br i1 %arg, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]] ; CHECK: while.cond.i.preheader: ; CHECK-NEXT: br label [[WHILE_COND_I:%.*]] ; CHECK: while.cond.i: -; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_BODY_I:%.*]] +; CHECK-NEXT: br i1 %arg, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_BODY_I:%.*]] ; CHECK: while.body.i: -; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]] +; CHECK-NEXT: br i1 %arg, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]] ; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: ; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x ptr> [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] ; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[__FIRST]], align 8 -; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] ; CHECK: 
if.then.i55: ; CHECK-NEXT: br label [[WHILE_COND]] ; CHECK: while.cond: @@ -34,23 +34,23 @@ entry: %1 = load ptr, ptr %__last, align 8 %_M_first3.i.i83 = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", ptr %__last, i64 0, i32 1 %2 = load ptr, ptr %_M_first3.i.i83, align 8 - br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i.preheader + br i1 %arg, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i.preheader while.cond.i.preheader: ; preds = %entry br label %while.cond.i while.cond.i: ; preds = %while.body.i, %while.cond.i.preheader - br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.body.i + br i1 %arg, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.body.i while.body.i: ; preds = %while.cond.i - br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i + br i1 %arg, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: ; preds = %while.body.i, %while.cond.i, %entry %3 = phi ptr [ %2, %entry ], [ %2, %while.cond.i ], [ undef, %while.body.i ] %4 = phi ptr [ %0, %entry ], [ %1, %while.cond.i ], [ undef, %while.body.i ] store ptr %4, ptr %__first, align 8 store ptr %3, ptr %_M_first3.i.i, align 8 - br i1 undef, label %if.then.i55, label %while.cond + br i1 %arg, label %if.then.i55, label %while.cond if.then.i55: ; preds = %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit br label %while.cond diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll index 371b06869841b..fc1bd856da9c3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll @@ -5,20 +5,20 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-macosx10.8.0" ; Function Attrs: nounwind ssp uwtable -define void @main() #0 { +define void @main(i1 %arg) #0 { ; CHECK-LABEL: @main( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]] ; CHECK: while.body: ; CHECK-NEXT: unreachable ; CHECK: while.end: -; CHECK-NEXT: br i1 undef, label [[FOR_END80:%.*]], label [[FOR_BODY75_LR_PH:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END80:%.*]], label [[FOR_BODY75_LR_PH:%.*]] ; CHECK: for.body75.lr.ph: ; CHECK-NEXT: br label [[FOR_BODY75:%.*]] ; CHECK: for.body75: ; CHECK-NEXT: br label [[FOR_BODY75]] ; CHECK: for.end80: -; CHECK-NEXT: br i1 undef, label [[FOR_END300:%.*]], label [[FOR_BODY267_LR_PH:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END300:%.*]], label [[FOR_BODY267_LR_PH:%.*]] ; CHECK: for.body267.lr.ph: ; CHECK-NEXT: br label [[FOR_BODY267:%.*]] ; CHECK: for.body267: @@ -32,18 +32,18 @@ define void @main() #0 { ; CHECK-NEXT: [[ADD295:%.*]] = fadd double undef, [[MUL294]] ; CHECK-NEXT: [[DIV296:%.*]] = fdiv double [[MUL283]], [[ADD295]] ; CHECK-NEXT: [[ADD297]] = fadd double [[S_71010]], [[DIV296]] -; CHECK-NEXT: br i1 undef, label [[FOR_BODY267]], label [[FOR_END300]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY267]], label [[FOR_END300]] ; CHECK: for.end300: ; CHECK-NEXT: unreachable ; entry: - br i1 undef, label %while.body, label %while.end + br i1 %arg, label 
%while.body, label %while.end while.body: ; preds = %entry unreachable while.end: ; preds = %entry - br i1 undef, label %for.end80, label %for.body75.lr.ph + br i1 %arg, label %for.end80, label %for.body75.lr.ph for.body75.lr.ph: ; preds = %while.end br label %for.body75 @@ -52,7 +52,7 @@ for.body75: ; preds = %for.body75, %for.bo br label %for.body75 for.end80: ; preds = %while.end - br i1 undef, label %for.end300, label %for.body267.lr.ph + br i1 %arg, label %for.end300, label %for.body267.lr.ph for.body267.lr.ph: ; preds = %for.end80 br label %for.body267 @@ -68,7 +68,7 @@ for.body267: ; preds = %for.body267, %for.b %add295 = fadd double undef, %mul294 %div296 = fdiv double %mul283, %add295 %add297 = fadd double %s.71010, %div296 - br i1 undef, label %for.body267, label %for.end300 + br i1 %arg, label %for.body267, label %for.end300 for.end300: ; preds = %for.body267, %for.end80 unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod-inseltpoison.ll index 6ac588524f845..d516126d8412d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod-inseltpoison.ll @@ -5,34 +5,34 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-macosx10.8.0" ; Function Attrs: nounwind ssp uwtable -define void @RCModelEstimator() { +define void @RCModelEstimator(i1 %arg) { ; CHECK-LABEL: @RCModelEstimator( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END_THREAD:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END_THREAD:%.*]] ; CHECK: for.end.thread: ; CHECK-NEXT: unreachable ; CHECK: for.body.lr.ph: -; CHECK-NEXT: br i1 undef, label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY3:%.*]], label [[IF_END103:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY3:%.*]], label [[IF_END103:%.*]] ; CHECK: for.cond14.preheader: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY16_LR_PH:%.*]], label [[IF_END103]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY16_LR_PH:%.*]], label [[IF_END103]] ; CHECK: for.body16.lr.ph: ; CHECK-NEXT: br label [[FOR_BODY16:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: br i1 undef, label [[IF_THEN7:%.*]], label [[FOR_INC11:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN7:%.*]], label [[FOR_INC11:%.*]] ; CHECK: if.then7: ; CHECK-NEXT: br label [[FOR_INC11]] ; CHECK: for.inc11: ; CHECK-NEXT: br i1 false, label [[FOR_COND14_PREHEADER:%.*]], label [[FOR_BODY3]] ; CHECK: for.body16: -; CHECK-NEXT: br i1 undef, label [[FOR_END39:%.*]], label [[FOR_BODY16]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END39:%.*]], label [[FOR_BODY16]] ; CHECK: for.end39: -; CHECK-NEXT: br i1 undef, label [[IF_END103]], label [[FOR_COND45_PREHEADER:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_END103]], label [[FOR_COND45_PREHEADER:%.*]] ; CHECK: for.cond45.preheader: -; CHECK-NEXT: br i1 undef, label [[IF_THEN88:%.*]], label [[IF_ELSE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN88:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then88: ; CHECK-NEXT: br label [[IF_END103]] ; CHECK: if.else: @@ -41,28 +41,28 @@ define void @RCModelEstimator() { ; CHECK-NEXT: ret void 
; entry: - br i1 undef, label %for.body.lr.ph, label %for.end.thread + br i1 %arg, label %for.body.lr.ph, label %for.end.thread for.end.thread: ; preds = %entry unreachable for.body.lr.ph: ; preds = %entry - br i1 undef, label %for.end, label %for.body + br i1 %arg, label %for.end, label %for.body for.body: ; preds = %for.body, %for.body.lr.ph - br i1 undef, label %for.end, label %for.body + br i1 %arg, label %for.end, label %for.body for.end: ; preds = %for.body, %for.body.lr.ph - br i1 undef, label %for.body3, label %if.end103 + br i1 %arg, label %for.body3, label %if.end103 for.cond14.preheader: ; preds = %for.inc11 - br i1 undef, label %for.body16.lr.ph, label %if.end103 + br i1 %arg, label %for.body16.lr.ph, label %if.end103 for.body16.lr.ph: ; preds = %for.cond14.preheader br label %for.body16 for.body3: ; preds = %for.inc11, %for.end - br i1 undef, label %if.then7, label %for.inc11 + br i1 %arg, label %if.then7, label %for.inc11 if.then7: ; preds = %for.body3 br label %for.inc11 @@ -71,13 +71,13 @@ for.inc11: ; preds = %if.then7, %for.body br i1 false, label %for.cond14.preheader, label %for.body3 for.body16: ; preds = %for.body16, %for.body16.lr.ph - br i1 undef, label %for.end39, label %for.body16 + br i1 %arg, label %for.end39, label %for.body16 for.end39: ; preds = %for.body16 - br i1 undef, label %if.end103, label %for.cond45.preheader + br i1 %arg, label %if.end103, label %for.cond45.preheader for.cond45.preheader: ; preds = %for.end39 - br i1 undef, label %if.then88, label %if.else + br i1 %arg, label %if.then88, label %if.else if.then88: ; preds = %for.cond45.preheader %mul89 = fmul double 0.000000e+00, 0.000000e+00 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll index f0d38e1bf269c..3ad0473c84766 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -5,34 +5,34 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-macosx10.8.0" ; Function Attrs: nounwind ssp uwtable -define void @RCModelEstimator() { +define void @RCModelEstimator(i1 %arg) { ; CHECK-LABEL: @RCModelEstimator( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END_THREAD:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END_THREAD:%.*]] ; CHECK: for.end.thread: ; CHECK-NEXT: unreachable ; CHECK: for.body.lr.ph: -; CHECK-NEXT: br i1 undef, label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY3:%.*]], label [[IF_END103:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY3:%.*]], label [[IF_END103:%.*]] ; CHECK: for.cond14.preheader: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY16_LR_PH:%.*]], label [[IF_END103]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY16_LR_PH:%.*]], label [[IF_END103]] ; CHECK: for.body16.lr.ph: ; CHECK-NEXT: br label [[FOR_BODY16:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: br i1 undef, label [[IF_THEN7:%.*]], label [[FOR_INC11:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN7:%.*]], label [[FOR_INC11:%.*]] ; CHECK: if.then7: ; CHECK-NEXT: br label [[FOR_INC11]] ; CHECK: for.inc11: ; CHECK-NEXT: br i1 false, label [[FOR_COND14_PREHEADER:%.*]], label 
[[FOR_BODY3]] ; CHECK: for.body16: -; CHECK-NEXT: br i1 undef, label [[FOR_END39:%.*]], label [[FOR_BODY16]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END39:%.*]], label [[FOR_BODY16]] ; CHECK: for.end39: -; CHECK-NEXT: br i1 undef, label [[IF_END103]], label [[FOR_COND45_PREHEADER:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_END103]], label [[FOR_COND45_PREHEADER:%.*]] ; CHECK: for.cond45.preheader: -; CHECK-NEXT: br i1 undef, label [[IF_THEN88:%.*]], label [[IF_ELSE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN88:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then88: ; CHECK-NEXT: br label [[IF_END103]] ; CHECK: if.else: @@ -41,28 +41,28 @@ define void @RCModelEstimator() { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %for.body.lr.ph, label %for.end.thread + br i1 %arg, label %for.body.lr.ph, label %for.end.thread for.end.thread: ; preds = %entry unreachable for.body.lr.ph: ; preds = %entry - br i1 undef, label %for.end, label %for.body + br i1 %arg, label %for.end, label %for.body for.body: ; preds = %for.body, %for.body.lr.ph - br i1 undef, label %for.end, label %for.body + br i1 %arg, label %for.end, label %for.body for.end: ; preds = %for.body, %for.body.lr.ph - br i1 undef, label %for.body3, label %if.end103 + br i1 %arg, label %for.body3, label %if.end103 for.cond14.preheader: ; preds = %for.inc11 - br i1 undef, label %for.body16.lr.ph, label %if.end103 + br i1 %arg, label %for.body16.lr.ph, label %if.end103 for.body16.lr.ph: ; preds = %for.cond14.preheader br label %for.body16 for.body3: ; preds = %for.inc11, %for.end - br i1 undef, label %if.then7, label %for.inc11 + br i1 %arg, label %if.then7, label %for.inc11 if.then7: ; preds = %for.body3 br label %for.inc11 @@ -71,13 +71,13 @@ for.inc11: ; preds = %if.then7, %for.body br i1 false, label %for.cond14.preheader, label %for.body3 for.body16: ; preds = %for.body16, %for.body16.lr.ph - br i1 undef, label %for.end39, label %for.body16 + br i1 %arg, label %for.end39, label %for.body16 for.end39: ; preds = %for.body16 - br i1 undef, label %if.end103, label %for.cond45.preheader + br i1 %arg, label %if.end103, label %for.cond45.preheader for.cond45.preheader: ; preds = %for.end39 - br i1 undef, label %if.then88, label %if.else + br i1 %arg, label %if.then88, label %if.else if.then88: ; preds = %for.cond45.preheader %mul89 = fmul double 0.000000e+00, 0.000000e+00 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll index e6c46e1847dac..403a610da8d55 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -define void @main() { +define void @main(i1 %arg) { ; CHECK-LABEL: @main( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -27,17 +27,17 @@ define void @main() { ; CHECK-NEXT: [[ADD19]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MUL13]], [[MUL14]] ; CHECK-NEXT: [[ADD20]] = fadd double undef, [[SUB]] -; CHECK-NEXT: br i1 undef, label [[FOR_BODY12]], label [[FOR_INC21]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY12]], label [[FOR_INC21]] ; CHECK: for.inc21: -; CHECK-NEXT: br i1 undef, label [[FOR_END23:%.*]], label [[FOR_BODY6]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END23:%.*]], label [[FOR_BODY6]] ; CHECK: 
for.end23: -; CHECK-NEXT: br i1 undef, label [[IF_THEN25:%.*]], label [[IF_THEN26:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN25:%.*]], label [[IF_THEN26:%.*]] ; CHECK: if.then25: -; CHECK-NEXT: br i1 undef, label [[FOR_END44:%.*]], label [[FOR_COND4_PREHEADER]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END44:%.*]], label [[FOR_COND4_PREHEADER]] ; CHECK: if.then26: ; CHECK-NEXT: unreachable ; CHECK: for.end44: -; CHECK-NEXT: br i1 undef, label [[FOR_END48:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END48:%.*]], label [[FOR_BODY]] ; CHECK: for.end48: ; CHECK-NEXT: ret void ; @@ -67,22 +67,22 @@ if.end: ; preds = %for.body12 %add19 = fadd double undef, %mul18 %sub = fsub double %mul13, %mul14 %add20 = fadd double undef, %sub - br i1 undef, label %for.body12, label %for.inc21 + br i1 %arg, label %for.body12, label %for.inc21 for.inc21: ; preds = %if.end, %for.body12 - br i1 undef, label %for.end23, label %for.body6 + br i1 %arg, label %for.end23, label %for.body6 for.end23: ; preds = %for.inc21 - br i1 undef, label %if.then25, label %if.then26 + br i1 %arg, label %if.then25, label %if.then26 if.then25: ; preds = %for.end23 - br i1 undef, label %for.end44, label %for.cond4.preheader + br i1 %arg, label %for.end44, label %for.cond4.preheader if.then26: ; preds = %for.end23 unreachable for.end44: ; preds = %if.then25 - br i1 undef, label %for.end48, label %for.body + br i1 %arg, label %for.end48, label %for.body for.end48: ; preds = %for.end44 ret void @@ -90,7 +90,7 @@ for.end48: ; preds = %for.end44 %struct.hoge = type { double, double, double} -define void @zot(ptr %arg) { +define void @zot(ptr %arg, i1 %arg2) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP:%.*]] = load double, ptr undef, align 8 @@ -102,7 +102,7 @@ define void @zot(ptr %arg) { ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], undef ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef ; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TMP7]], align 8 -; CHECK-NEXT: br i1 undef, label [[BB11:%.*]], label [[BB12:%.*]] +; CHECK-NEXT: br i1 %arg2, label [[BB11:%.*]], label [[BB12:%.*]] ; CHECK: bb11: ; CHECK-NEXT: br label [[BB14:%.*]] ; CHECK: bb12: @@ -124,7 +124,7 @@ bb: %tmp9 = fsub double %tmp8, undef %tmp10 = getelementptr inbounds %struct.hoge, ptr %arg, i64 0, i32 2 store double %tmp9, ptr %tmp10, align 8 - br i1 undef, label %bb11, label %bb12 + br i1 %arg2, label %bb11, label %bb12 bb11: ; preds = %bb br label %bb14 @@ -140,15 +140,15 @@ bb14: ; preds = %bb12, %bb11 %struct.rc4_state.0.24 = type { i32, i32, [256 x i32] } -define void @rc4_crypt(ptr nocapture %s) { +define void @rc4_crypt(ptr nocapture %s, i1 %arg) { ; CHECK-LABEL: @rc4_crypt( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Y2:%.*]] = getelementptr inbounds [[STRUCT_RC4_STATE_0_24:%.*]], ptr [[S:%.*]], i64 0, i32 1 -; CHECK-NEXT: br i1 undef, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[CONV4:%.*]] = and i32 undef, 255 ; CHECK-NEXT: [[CONV7:%.*]] = and i32 undef, 255 -; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[X_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[CONV4]], [[FOR_BODY]] ] ; CHECK-NEXT: [[Y_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[CONV7]], [[FOR_BODY]] ] @@ -158,14 +158,14 @@ define void @rc4_crypt(ptr nocapture %s) { ; entry: %y2 = getelementptr inbounds 
%struct.rc4_state.0.24, ptr %s, i64 0, i32 1 - br i1 undef, label %for.body, label %for.end + br i1 %arg, label %for.body, label %for.end for.body: ; preds = %for.body, %entry %x.045 = phi i32 [ %conv4, %for.body ], [ undef, %entry ] %conv4 = and i32 undef, 255 %conv7 = and i32 undef, 255 %idxprom842 = zext i32 %conv7 to i64 - br i1 undef, label %for.end, label %for.body + br i1 %arg, label %for.end, label %for.body for.end: ; preds = %for.body, %entry %x.0.lcssa = phi i32 [ undef, %entry ], [ %conv4, %for.body ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll index a9f92f324d6f5..d434035051f5e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll @@ -6,28 +6,28 @@ target triple = "x86_64-apple-macosx10.8.0" %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171 = type { i32, i32, i32, i32, i32, i32, [8 x i8] } -define void @SIM4() { +define void @SIM4(i1 %arg) { ; CHECK-LABEL: @SIM4( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]] ; CHECK: lor.lhs.false: -; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[RETURN]], label [[IF_END:%.*]] ; CHECK: if.end: -; CHECK-NEXT: br i1 undef, label [[FOR_END605:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END605:%.*]], label [[FOR_BODY_LR_PH:%.*]] ; CHECK: for.body.lr.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: br i1 undef, label [[FOR_INC603:%.*]], label [[IF_END12:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_INC603:%.*]], label [[IF_END12:%.*]] ; CHECK: if.end12: -; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE:%.*]], label [[LAND_LHS_TRUE167:%.*]] +; CHECK-NEXT: br i1 %arg, label [[LAND_LHS_TRUE:%.*]], label [[LAND_LHS_TRUE167:%.*]] ; CHECK: land.lhs.true: -; CHECK-NEXT: br i1 undef, label [[IF_THEN17:%.*]], label [[LAND_LHS_TRUE167]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN17:%.*]], label [[LAND_LHS_TRUE167]] ; CHECK: if.then17: -; CHECK-NEXT: br i1 undef, label [[IF_END98:%.*]], label [[LAND_RHS_LR_PH:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_END98:%.*]], label [[LAND_RHS_LR_PH:%.*]] ; CHECK: land.rhs.lr.ph: ; CHECK-NEXT: unreachable ; CHECK: if.end98: -; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE167]], label [[IF_THEN103:%.*]] +; CHECK-NEXT: br i1 %arg, label [[LAND_LHS_TRUE167]], label [[IF_THEN103:%.*]] ; CHECK: if.then103: ; CHECK-NEXT: [[DOTSUB100:%.*]] = select i1 undef, i32 250, i32 undef ; CHECK-NEXT: [[MUL114:%.*]] = shl nsw i32 [[DOTSUB100]], 2 @@ -37,11 +37,11 @@ define void @SIM4() { ; CHECK-NEXT: br label [[FOR_COND_I:%.*]] ; CHECK: for.cond.i: ; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ undef, [[LAND_RHS_I874:%.*]] ], [ [[TMP1]], [[IF_THEN103]] ] -; CHECK-NEXT: br i1 undef, label [[LAND_RHS_I874]], label [[FOR_END_I:%.*]] +; CHECK-NEXT: br i1 %arg, label [[LAND_RHS_I874]], label [[FOR_END_I:%.*]] ; CHECK: land.rhs.i874: -; CHECK-NEXT: br i1 undef, label [[FOR_COND_I]], label [[FOR_END_I]] +; CHECK-NEXT: br i1 %arg, label [[FOR_COND_I]], label [[FOR_END_I]] ; CHECK: for.end.i: -; CHECK-NEXT: br i1 undef, label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN_I:%.*]], label 
[[IF_END_I:%.*]] ; CHECK: if.then.i: ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], undef ; CHECK-NEXT: br label [[EXTEND_BW_EXIT:%.*]] @@ -52,15 +52,15 @@ define void @SIM4() { ; CHECK: for.body28.lr.ph.i: ; CHECK-NEXT: br label [[FOR_END33_I]] ; CHECK: for.end33.i: -; CHECK-NEXT: br i1 undef, label [[FOR_END58_I:%.*]], label [[FOR_BODY52_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END58_I:%.*]], label [[FOR_BODY52_LR_PH_I:%.*]] ; CHECK: for.body52.lr.ph.i: ; CHECK-NEXT: br label [[FOR_END58_I]] ; CHECK: for.end58.i: ; CHECK-NEXT: br label [[WHILE_COND260_I:%.*]] ; CHECK: while.cond260.i: -; CHECK-NEXT: br i1 undef, label [[LAND_RHS263_I:%.*]], label [[WHILE_END275_I:%.*]] +; CHECK-NEXT: br i1 %arg, label [[LAND_RHS263_I:%.*]], label [[WHILE_END275_I:%.*]] ; CHECK: land.rhs263.i: -; CHECK-NEXT: br i1 undef, label [[WHILE_COND260_I]], label [[WHILE_END275_I]] +; CHECK-NEXT: br i1 %arg, label [[WHILE_COND260_I]], label [[WHILE_END275_I]] ; CHECK: while.end275.i: ; CHECK-NEXT: br label [[EXTEND_BW_EXIT]] ; CHECK: extend_bw.exit: @@ -73,42 +73,42 @@ define void @SIM4() { ; CHECK: land.lhs.true167: ; CHECK-NEXT: unreachable ; CHECK: for.inc603: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_END605]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY]], label [[FOR_END605]] ; CHECK: for.end605: ; CHECK-NEXT: unreachable ; CHECK: return: ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %return, label %lor.lhs.false + br i1 %arg, label %return, label %lor.lhs.false lor.lhs.false: ; preds = %entry - br i1 undef, label %return, label %if.end + br i1 %arg, label %return, label %if.end if.end: ; preds = %lor.lhs.false - br i1 undef, label %for.end605, label %for.body.lr.ph + br i1 %arg, label %for.end605, label %for.body.lr.ph for.body.lr.ph: ; preds = %if.end br label %for.body for.body: ; preds = %for.inc603, %for.body.lr.ph - br i1 undef, label %for.inc603, label %if.end12 + br i1 %arg, label %for.inc603, label %if.end12 if.end12: ; preds = %for.body - br i1 undef, label %land.lhs.true, label %land.lhs.true167 + br i1 %arg, label %land.lhs.true, label %land.lhs.true167 land.lhs.true: ; preds = %if.end12 - br i1 undef, label %if.then17, label %land.lhs.true167 + br i1 %arg, label %if.then17, label %land.lhs.true167 if.then17: ; preds = %land.lhs.true - br i1 undef, label %if.end98, label %land.rhs.lr.ph + br i1 %arg, label %if.end98, label %land.rhs.lr.ph land.rhs.lr.ph: ; preds = %if.then17 unreachable if.end98: ; preds = %if.then17 %from299 = getelementptr inbounds %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171, ptr undef, i64 0, i32 1 - br i1 undef, label %land.lhs.true167, label %if.then103 + br i1 %arg, label %land.lhs.true167, label %if.then103 if.then103: ; preds = %if.end98 %.sub100 = select i1 undef, i32 250, i32 undef @@ -119,13 +119,13 @@ if.then103: ; preds = %if.end98 for.cond.i: ; preds = %land.rhs.i874, %if.then103 %row.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %.sub100, %if.then103 ] %col.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %cond125, %if.then103 ] - br i1 undef, label %land.rhs.i874, label %for.end.i + br i1 %arg, label %land.rhs.i874, label %for.end.i land.rhs.i874: ; preds = %for.cond.i - br i1 undef, label %for.cond.i, label %for.end.i + br i1 %arg, label %for.cond.i, label %for.end.i for.end.i: ; preds = %land.rhs.i874, %for.cond.i - br i1 undef, label %if.then.i, label %if.end.i + br i1 %arg, label 
%if.then.i, label %if.end.i if.then.i: ; preds = %for.end.i %add14.i = add nsw i32 %row.0.i, undef @@ -141,7 +141,7 @@ for.body28.lr.ph.i: ; preds = %if.end.i br label %for.end33.i for.end33.i: ; preds = %for.body28.lr.ph.i, %if.end.i - br i1 undef, label %for.end58.i, label %for.body52.lr.ph.i + br i1 %arg, label %for.end58.i, label %for.body52.lr.ph.i for.body52.lr.ph.i: ; preds = %for.end33.i br label %for.end58.i @@ -150,10 +150,10 @@ for.end58.i: ; preds = %for.body52.lr.ph.i, br label %while.cond260.i while.cond260.i: ; preds = %land.rhs263.i, %for.end58.i - br i1 undef, label %land.rhs263.i, label %while.end275.i + br i1 %arg, label %land.rhs263.i, label %while.end275.i land.rhs263.i: ; preds = %while.cond260.i - br i1 undef, label %while.cond260.i, label %while.end275.i + br i1 %arg, label %while.cond260.i, label %while.end275.i while.end275.i: ; preds = %land.rhs263.i, %while.cond260.i br label %extend_bw.exit @@ -174,7 +174,7 @@ land.lhs.true167: ; preds = %if.then157, %extend unreachable for.inc603: ; preds = %for.body - br i1 undef, label %for.body, label %for.end605 + br i1 %arg, label %for.body, label %for.end605 for.end605: ; preds = %for.inc603, %if.end unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll index 42ad20ff578c1..d13a8578d1e00 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -4,25 +4,25 @@ %struct.Ray = type { %struct.Vec, %struct.Vec } %struct.Vec = type { double, double, double } -define void @main() { +define void @main(i1 %arg) { ; CHECK-LABEL: @main( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[COND_TRUE:%.*]], label [[COND_END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[COND_TRUE:%.*]], label [[COND_END:%.*]] ; CHECK: cond.true: ; CHECK-NEXT: unreachable ; CHECK: cond.end: ; CHECK-NEXT: br label [[INVOKE_CONT:%.*]] ; CHECK: invoke.cont: -; CHECK-NEXT: br i1 undef, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]] +; CHECK-NEXT: br i1 %arg, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]] ; CHECK: arrayctor.cont: ; CHECK-NEXT: [[AGG_TMP101211_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0 ; CHECK-NEXT: br label [[FOR_COND36_PREHEADER:%.*]] ; CHECK: for.cond36.preheader: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]] ; CHECK: cond.false51.us: ; CHECK-NEXT: unreachable ; CHECK: cond.true48.us: -; CHECK-NEXT: br i1 undef, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]] +; CHECK-NEXT: br i1 %arg, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]] ; CHECK: cond.false66.us: ; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, 0x3EB0C6F7A0B5ED8D ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> , double [[ADD_I276_US]], i32 0 @@ -36,12 +36,12 @@ define void @main() { ; CHECK: cond.true63.us: ; CHECK-NEXT: unreachable ; CHECK: for.body42.lr.ph.us: -; CHECK-NEXT: br i1 undef, label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]] +; CHECK-NEXT: br i1 %arg, label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]] ; CHECK: _Z5clampd.exit.1: ; CHECK-NEXT: br label [[FOR_COND36_PREHEADER]] ; entry: - br i1 undef, label %cond.true, label %cond.end + br i1 %arg, label %cond.true, label %cond.end cond.true: unreachable @@ -50,7 +50,7 @@ cond.end: br label 
%invoke.cont invoke.cont: - br i1 undef, label %arrayctor.cont, label %invoke.cont + br i1 %arg, label %arrayctor.cont, label %invoke.cont arrayctor.cont: %agg.tmp99208.sroa.1.8.idx388 = getelementptr inbounds %struct.Ray, ptr undef, i64 0, i32 0, i32 1 @@ -59,13 +59,13 @@ arrayctor.cont: br label %for.cond36.preheader for.cond36.preheader: - br i1 undef, label %for.body42.lr.ph.us, label %_Z5clampd.exit.1 + br i1 %arg, label %for.body42.lr.ph.us, label %_Z5clampd.exit.1 cond.false51.us: unreachable cond.true48.us: - br i1 undef, label %cond.true63.us, label %cond.false66.us + br i1 %arg, label %cond.true63.us, label %cond.false66.us cond.false66.us: %add.i276.us = fadd double 0.000000e+00, 0.000001e+00 @@ -87,16 +87,16 @@ cond.true63.us: unreachable for.body42.lr.ph.us: - br i1 undef, label %cond.true48.us, label %cond.false51.us + br i1 %arg, label %cond.true48.us, label %cond.false51.us _Z5clampd.exit.1: br label %for.cond36.preheader } -define void @test() { +define void @test(i1 %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]] ; CHECK: if.then38: ; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0 ; CHECK-NEXT: store <2 x double> , ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8 @@ -105,7 +105,7 @@ define void @test() { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %if.then78, label %if.then38 + br i1 %arg, label %if.then78, label %if.then38 if.then38: %mul.i.i790 = fmul double 0.0, 0.1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll index e3a860a4c6f06..c7c4b06be2d19 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll @@ -16,7 +16,7 @@ target triple = "x86_64-apple-macosx10.9.0" ;define fastcc void @bar() { -define void @bar() { +define void @bar(i1 %arg) { ; CHECK-LABEL: @bar( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds [[TMP0:%.*]], ptr undef, i64 0, i32 1, i32 0 @@ -32,16 +32,16 @@ define void @bar() { ; CHECK-NEXT: store double [[I7]], ptr [[I1]], align 8 ; CHECK-NEXT: [[I10]] = load double, ptr [[I3]], align 8 ; CHECK-NEXT: [[TMP0]] = load <2 x double>, ptr [[I2]], align 8 -; CHECK-NEXT: br i1 undef, label [[BB11:%.*]], label [[BB12:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB11:%.*]], label [[BB12:%.*]] ; CHECK: bb11: ; CHECK-NEXT: ret void ; CHECK: bb12: ; CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[I4]], align 8 -; CHECK-NEXT: br i1 undef, label [[BB13:%.*]], label [[BB14:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB13:%.*]], label [[BB14:%.*]] ; CHECK: bb13: ; CHECK-NEXT: br label [[BB14]] ; CHECK: bb14: -; CHECK-NEXT: br i1 undef, label [[BB15:%.*]], label [[BB16]] +; CHECK-NEXT: br i1 %arg, label [[BB15:%.*]], label [[BB16]] ; CHECK: bb15: ; CHECK-NEXT: unreachable ; CHECK: bb16: @@ -51,7 +51,7 @@ define void @bar() { ; CHECK-NEXT: i32 103, label [[BB6]] ; CHECK-NEXT: ] ; CHECK: bb17: -; CHECK-NEXT: br i1 undef, label [[BB6]], label [[BB18:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB6]], label [[BB18:%.*]] ; CHECK: bb18: ; CHECK-NEXT: unreachable ; @@ -71,7 +71,7 @@ bb6: ; preds = %bb17, %bb16, %bb16, store double %i7, ptr %i1, align 8 %i9 = load double, ptr %i2, align 8 %i10 = load double, ptr %i3, align 8 - br i1 undef, label %bb11, label %bb12 + 
br i1 %arg, label %bb11, label %bb12 bb11: ; preds = %bb6 ret void @@ -79,13 +79,13 @@ bb11: ; preds = %bb6 bb12: ; preds = %bb6 store double %i9, ptr %i4, align 8 store double %i10, ptr %i5, align 8 - br i1 undef, label %bb13, label %bb14 + br i1 %arg, label %bb13, label %bb14 bb13: ; preds = %bb12 br label %bb14 bb14: ; preds = %bb13, %bb12 - br i1 undef, label %bb15, label %bb16 + br i1 %arg, label %bb15, label %bb16 bb15: ; preds = %bb14 unreachable @@ -97,7 +97,7 @@ bb16: ; preds = %bb14 ] bb17: ; preds = %bb16 - br i1 undef, label %bb6, label %bb18 + br i1 %arg, label %bb6, label %bb18 bb18: ; preds = %bb17 unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll index 739e3964c2685..7510b8fb83e34 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -280,10 +280,10 @@ return: ; preds = %entry, %if.end @a = external global double, align 8 -define void @PR19646(ptr %this) { +define void @PR19646(ptr %this, i1 %arg) { ; CHECK-LABEL: @PR19646( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[IF_END13:%.*]], label [[IF_END13]] +; CHECK-NEXT: br i1 %arg, label [[IF_END13:%.*]], label [[IF_END13]] ; CHECK: sw.epilog7: ; CHECK-NEXT: [[DOTIN:%.*]] = getelementptr inbounds [[CLASS_B_53_55:%.*]], ptr [[THIS:%.*]], i64 0, i32 0, i32 1 ; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[DOTIN]], align 8 @@ -294,7 +294,7 @@ define void @PR19646(ptr %this) { ; CHECK-NEXT: [[_DY:%.*]] = getelementptr inbounds [[CLASS_B_53_55]], ptr [[THIS]], i64 0, i32 0, i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[_DY]], align 8 ; CHECK-NEXT: [[ADD10:%.*]] = fadd double [[ADD8]], [[TMP2]] -; CHECK-NEXT: br i1 undef, label [[IF_THEN12:%.*]], label [[IF_END13]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN12:%.*]], label [[IF_END13]] ; CHECK: if.then12: ; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr undef, align 8 ; CHECK-NEXT: br label [[IF_END13]] @@ -304,7 +304,7 @@ define void @PR19646(ptr %this) { ; CHECK-NEXT: unreachable ; entry: - br i1 undef, label %if.end13, label %if.end13 + br i1 %arg, label %if.end13, label %if.end13 sw.epilog7: ; No predecessors! %.in = getelementptr inbounds %class.B.53.55, ptr %this, i64 0, i32 0, i32 1 @@ -316,7 +316,7 @@ sw.epilog7: ; No predecessors! 
%_dy = getelementptr inbounds %class.B.53.55, ptr %this, i64 0, i32 0, i32 2 %2 = load double, ptr %_dy, align 8 %add10 = fadd double %add8, %2 - br i1 undef, label %if.then12, label %if.end13 + br i1 %arg, label %if.then12, label %if.end13 if.then12: ; preds = %sw.epilog7 %3 = load double, ptr undef, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll index f614796916baa..d474218e84cca 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll @@ -116,7 +116,7 @@ then: %struct.wombat.0 = type { %struct.bar } %struct.bar = type { [3 x double], [3 x double], double, double, i16, ptr, i32, [3 x double] } -define double @preserve_loop_info(ptr %arg) { +define double @preserve_loop_info(ptr %arg, i1 %arg2) { ; CHECK-LABEL: @preserve_loop_info( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP:%.*]] = alloca [3 x double], align 16 @@ -124,15 +124,15 @@ define double @preserve_loop_info(ptr %arg) { ; CHECK: outer.header: ; CHECK-NEXT: br label [[INNER:%.*]] ; CHECK: inner: -; CHECK-NEXT: br i1 undef, label [[OUTER_LATCH:%.*]], label [[INNER]] +; CHECK-NEXT: br i1 %arg2, label [[OUTER_LATCH:%.*]], label [[INNER]] ; CHECK: outer.latch: -; CHECK-NEXT: br i1 undef, label [[BB:%.*]], label [[OUTER_HEADER]] +; CHECK-NEXT: br i1 %arg2, label [[BB:%.*]], label [[OUTER_HEADER]] ; CHECK: bb: ; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr undef, align 8 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x double], ptr [[TMP]], i64 0, i64 1 ; CHECK-NEXT: br label [[LOOP_3HEADER:%.*]] ; CHECK: loop.3header: -; CHECK-NEXT: br i1 undef, label [[LOOP_3LATCH:%.*]], label [[BB9:%.*]] +; CHECK-NEXT: br i1 %arg2, label [[LOOP_3LATCH:%.*]], label [[BB9:%.*]] ; CHECK: bb9: ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x double], ptr [[TMP5]], i64 undef, i64 1 ; CHECK-NEXT: store double undef, ptr [[TMP]], align 16 @@ -140,7 +140,7 @@ define double @preserve_loop_info(ptr %arg) { ; CHECK-NEXT: store double [[TMP12]], ptr [[TMP7]], align 8 ; CHECK-NEXT: br label [[LOOP_3LATCH]] ; CHECK: loop.3latch: -; CHECK-NEXT: br i1 undef, label [[BB14:%.*]], label [[LOOP_3HEADER]] +; CHECK-NEXT: br i1 %arg2, label [[BB14:%.*]], label [[LOOP_3HEADER]] ; CHECK: bb14: ; CHECK-NEXT: [[TMP15:%.*]] = call double undef(ptr [[TMP]], ptr [[ARG:%.*]]) ; CHECK-NEXT: ret double undef @@ -153,10 +153,10 @@ outer.header: ; preds = %bb3, %bb br label %inner inner: - br i1 undef, label %outer.latch, label %inner + br i1 %arg2, label %outer.latch, label %inner outer.latch: ; preds = %bb16 - br i1 undef, label %bb, label %outer.header + br i1 %arg2, label %bb, label %outer.header bb: ; preds = %bb3 %tmp5 = load ptr, ptr undef, align 8 @@ -164,7 +164,7 @@ bb: ; preds = %bb3 br label %loop.3header loop.3header: ; preds = %bb13, %bb4 - br i1 undef, label %loop.3latch, label %bb9 + br i1 %arg2, label %loop.3latch, label %bb9 bb9: ; preds = %bb8 %tmp10 = getelementptr inbounds [3 x double], ptr %tmp5, i64 undef, i64 1 @@ -174,7 +174,7 @@ bb9: ; preds = %bb8 br label %loop.3latch loop.3latch: ; preds = %bb11, %bb8 - br i1 undef, label %bb14, label %loop.3header + br i1 %arg2, label %bb14, label %loop.3header bb14: ; preds = %bb13 %tmp15 = call double undef(ptr %tmp, ptr %arg) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/no-scheduled-instructions.ll b/llvm/test/Transforms/SLPVectorizer/X86/no-scheduled-instructions.ll index 43d8c1342cbbc..eea22c1861b44 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/no-scheduled-instructions.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/no-scheduled-instructions.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -passes=slp-vectorizer -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -define void @test() { +define void @test(i1 %arg) { ; CHECK-LABEL: define void @test -; CHECK-SAME: () #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK-SAME: (i1 %arg) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: br i1 %arg, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) ; CHECK-NEXT: call void @f(i32 noundef [[TMP1]]) @@ -20,7 +20,7 @@ define void @test() { %i32 = extractelement <4 x i32> zeroinitializer, i64 1 %i33 = extractelement <4 x i32> zeroinitializer, i64 2 %i34 = extractelement <4 x i32> zeroinitializer, i64 3 - br i1 undef, label %bb1, label %bb2 + br i1 %arg, label %bb1, label %bb2 bb1: %i11 = mul nsw i32 %i28, %i27 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/ordering.ll index ad3ebf57ab7a4..f9d6c3eab350f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ordering.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ordering.ll @@ -26,10 +26,10 @@ entry: declare ptr @objc_msgSend(ptr, ptr, ...) declare i32 @personality_v0(...) -define void @invoketest() personality ptr @personality_v0 { +define void @invoketest(i1 %arg) personality ptr @personality_v0 { ; CHECK-LABEL: @invoketest( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] ; CHECK: cond.true: ; CHECK-NEXT: [[CALL49:%.*]] = invoke double @objc_msgSend(ptr undef, ptr undef) ; CHECK-NEXT: to label [[COND_TRUE54:%.*]] unwind label [[LPAD:%.*]] @@ -43,7 +43,7 @@ define void @invoketest() personality ptr @personality_v0 { ; CHECK-NEXT: [[CALL59:%.*]] = invoke double @objc_msgSend(ptr undef, ptr undef) ; CHECK-NEXT: to label [[COND_END60]] unwind label [[LPAD]] ; CHECK: cond.end60: -; CHECK-NEXT: br i1 undef, label [[IF_END98:%.*]], label [[IF_THEN63:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_END98:%.*]], label [[IF_THEN63:%.*]] ; CHECK: if.then63: ; CHECK-NEXT: br label [[IF_END98]] ; CHECK: lpad: @@ -56,7 +56,7 @@ define void @invoketest() personality ptr @personality_v0 { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %cond.true, label %cond.false + br i1 %arg, label %cond.true, label %cond.false cond.true: %call49 = invoke double @objc_msgSend(ptr undef, ptr undef) @@ -77,7 +77,7 @@ cond.false57: cond.end60: %cond126 = phi double [ %call49, %cond.true54 ], [ %call51, %cond.false57 ] %cond61 = phi double [ %call56, %cond.true54 ], [ %call59, %cond.false57 ] - br i1 undef, label %if.end98, label %if.then63 + br i1 %arg, label %if.end98, label %if.then63 if.then63: %conv69 = fptrunc double undef to float diff --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll index 20ad09a632826..4cdf9670394f1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll @@ -2,12 +2,12 @@ ; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s ; Function Attrs: nounwind uwtable -define void @get_block(i32 %y_pos) local_unnamed_addr #0 { 
+define void @get_block(i32 %y_pos, i1 %arg) local_unnamed_addr #0 { ; CHECK-LABEL: @get_block( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LAND_LHS_TRUE:%.*]] ; CHECK: land.lhs.true: -; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: unreachable ; CHECK: if.end: @@ -43,7 +43,7 @@ entry: br label %land.lhs.true land.lhs.true: ; preds = %entry - br i1 undef, label %if.then, label %if.end + br i1 %arg, label %if.then, label %if.end if.then: ; preds = %land.lhs.true unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index 78bfb8df51aeb..df85656800aac 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -277,13 +277,13 @@ for.end: ; preds = %for.body ret float %add31 } -define void @test(ptr %i1, ptr %i2, ptr %o) { +define void @test(ptr %i1, ptr %i2, ptr %o, i1 %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[I1_0:%.*]] = load x86_fp80, ptr [[I1:%.*]], align 16 ; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr x86_fp80, ptr [[I1]], i64 1 ; CHECK-NEXT: [[I1_1:%.*]] = load x86_fp80, ptr [[I1_GEP1]], align 16 -; CHECK-NEXT: br i1 undef, label [[THEN:%.*]], label [[END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[THEN:%.*]], label [[END:%.*]] ; CHECK: then: ; CHECK-NEXT: [[I2_0:%.*]] = load x86_fp80, ptr [[I2:%.*]], align 16 ; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds x86_fp80, ptr [[I2]], i64 1 @@ -305,7 +305,7 @@ entry: %i1.0 = load x86_fp80, ptr %i1, align 16 %i1.gep1 = getelementptr x86_fp80, ptr %i1, i64 1 %i1.1 = load x86_fp80, ptr %i1.gep1, align 16 - br i1 undef, label %then, label %end + br i1 %arg, label %then, label %end then: %i2.0 = load x86_fp80, ptr %i2, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll index 88ac2d9dc42d1..ddbe943e32446 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll @@ -6,7 +6,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3 target triple = "i386-apple-macosx10.9.0" -define void @test(ptr %i1, ptr %i2, ptr %o) { +define void @test(ptr %i1, ptr %i2, ptr %o, i1 %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[I1_0:%.*]] = load double, ptr [[I1:%.*]], align 16 @@ -14,7 +14,7 @@ define void @test(ptr %i1, ptr %i2, ptr %o) { ; CHECK-NEXT: [[I1_1:%.*]] = load double, ptr [[I1_GEP1]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I1_0]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1_1]], i32 1 -; CHECK-NEXT: br i1 undef, label [[THEN:%.*]], label [[END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[THEN:%.*]], label [[END:%.*]] ; CHECK: then: ; CHECK-NEXT: [[I2_0:%.*]] = load double, ptr [[I2:%.*]], align 16 ; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds double, ptr [[I2]], i64 1 @@ -38,7 +38,7 @@ entry: %i1.0 = load double, ptr %i1, align 16 %i1.gep1 = getelementptr double, ptr %i1, i64 1 %i1.1 = load double, ptr %i1.gep1, align 16 - br i1 undef, label %then, label %end + br i1 %arg, label %then, label %end then: %i2.0 = load double, ptr %i2, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr16571.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr16571.ll index 
b61b004797959..733033efa3bfe 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr16571.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr16571.ll @@ -3,9 +3,9 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32" target triple = "i686-pc-win32" -define hidden fastcc void @"System.PrimitiveTypesParser.TryParseIEEE754(char*,uint,double&)"() unnamed_addr { +define hidden fastcc void @"System.PrimitiveTypesParser.TryParseIEEE754(char*,uint,double&)"(i1 %arg) unnamed_addr { "@0": - br i1 undef, label %"@38.lr.ph", label %"@37" + br i1 %arg, label %"@38.lr.ph", label %"@37" "@37": ; preds = %"@38.lr.ph", %"@44", %"@0" ret void @@ -13,10 +13,10 @@ define hidden fastcc void @"System.PrimitiveTypesParser.TryParseIEEE754(ch "@44": ; preds = %"@38.lr.ph" %0 = add i64 undef, undef %1 = add i32 %mainPartDigits.loc.0.ph45, 1 - br i1 undef, label %"@38.lr.ph", label %"@37" + br i1 %arg, label %"@38.lr.ph", label %"@37" "@38.lr.ph": ; preds = %"@44", %"@0" %mainDoublePart.loc.0.ph46 = phi i64 [ %0, %"@44" ], [ 0, %"@0" ] %mainPartDigits.loc.0.ph45 = phi i32 [ %1, %"@44" ], [ 0, %"@0" ] - br i1 undef, label %"@44", label %"@37" + br i1 %arg, label %"@44", label %"@37" } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll index 540ea4eb659fe..ac8b2428a3dea 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll @@ -3,13 +3,13 @@ ; RUN: FileCheck --input-file=%t --check-prefix=YAML %s ; This type is not supported by SLP -define i1 @test(ptr %i1, ptr %i2) { +define i1 @test(ptr %i1, ptr %i2, i1 %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[I1_0:%.*]] = load x86_fp80, ptr [[I1:%.*]], align 16 ; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr x86_fp80, ptr [[I1]], i64 1 ; CHECK-NEXT: [[I1_1:%.*]] = load x86_fp80, ptr [[I1_GEP1]], align 16 -; CHECK-NEXT: br i1 undef, label [[THEN:%.*]], label [[END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[THEN:%.*]], label [[END:%.*]] ; CHECK: then: ; CHECK-NEXT: [[I2_0:%.*]] = load x86_fp80, ptr [[I2:%.*]], align 16 ; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds x86_fp80, ptr [[I2]], i64 1 @@ -25,7 +25,7 @@ entry: %i1.0 = load x86_fp80, ptr %i1, align 16 %i1.gep1 = getelementptr x86_fp80, ptr %i1, i64 1 %i1.1 = load x86_fp80, ptr %i1.gep1, align 16 - br i1 undef, label %then, label %end + br i1 %arg, label %then, label %end then: %i2.0 = load x86_fp80, ptr %i2, align 16 %i2.gep1 = getelementptr inbounds x86_fp80, ptr %i2, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll index f9815529a2375..5f2199aef17ee 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -3,10 +3,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define void @hoge() { +define void @hoge(i1 %arg) { ; CHECK-LABEL: @hoge( ; CHECK-NEXT: bb: -; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: ret void ; CHECK: bb2: @@ -57,7 +57,7 @@ define void @hoge() { ; CHECK-NEXT: unreachable ; bb: - br i1 undef, label %bb1, label %bb2 + br i1 %arg, label %bb1, label %bb2 bb1: ; preds = %bb ret void diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll index 69b4639d9c131..2e6df0007e08b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll @@ -42,16 +42,16 @@ ; comment out reorderTopToBottom() and remove the stores. -define void @reorder_crash(ptr %ptr) { +define void @reorder_crash(ptr %ptr, i1 %arg) { ; CHECK-LABEL: @reorder_crash( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[BB0:%.*]], label [[BB12:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB0:%.*]], label [[BB12:%.*]] ; CHECK: bb0: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[PTR:%.*]], align 4 ; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[PTR]], align 4 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb12: -; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTR]], align 4 ; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[PTR]], align 4 @@ -69,7 +69,7 @@ entry: %gep1 = getelementptr inbounds float, ptr %ptr, i64 1 %gep2 = getelementptr inbounds float, ptr %ptr, i64 2 %gep3 = getelementptr inbounds float, ptr %ptr, i64 3 - br i1 undef, label %bb0, label %bb12 + br i1 %arg, label %bb0, label %bb12 bb0: ; Used by phi in this order: 1, 0, 2, 3 @@ -86,7 +86,7 @@ bb0: br label %bb3 bb12: - br i1 undef, label %bb1, label %bb2 + br i1 %arg, label %bb1, label %bb2 bb1: ; Used by phi in this order: 1, 0, 2, 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll index cb955ff91ed81..ef1149a108e29 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -4,10 +4,10 @@ %"struct.std::array" = type { [32 x i8] } ; Function Attrs: nounwind uwtable -define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() unnamed_addr #0 align 2 { +define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv(i1 %arg) unnamed_addr #0 align 2 { ; CHECK-LABEL: @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[IF_END50_I:%.*]], label [[IF_THEN22_I:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_END50_I:%.*]], label [[IF_THEN22_I:%.*]] ; CHECK: if.then22.i: ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] @@ -36,7 +36,7 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %if.end50.i, label %if.then22.i + br i1 %arg, label %if.end50.i, label %if.then22.i if.then22.i: ; preds = %entry %sub.i = add nsw i32 undef, -1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll index 79ce74bd21dbc..b900bd3a8c331 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 %struct.S = type { [3 x float], [3 x float], [4 x float] } -define i32 @foo(i32 %0, ptr %1, ptr %2) { +define i32 @foo(i32 %0, ptr %1, ptr %2, i1 %arg) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: 
[[T4:%.*]] = alloca [[STRUCT_S:%.*]], align 8 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[T4]], i64 0, i32 1 @@ -19,7 +19,7 @@ define i32 @foo(i32 %0, ptr %1, ptr %2) { ; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[T4]], i64 0, i32 2, i64 0 ; CHECK-NEXT: store <4 x float> [[SHUFFLE]], ptr [[T21]], align 4 ; CHECK-NEXT: [[T89]] = load <2 x float>, ptr [[T9]], align 4 -; CHECK-NEXT: br i1 undef, label [[T37]], label [[T55:%.*]] +; CHECK-NEXT: br i1 %arg, label [[T37]], label [[T55:%.*]] ; CHECK: t55: ; CHECK-NEXT: ret i32 0 ; @@ -49,7 +49,7 @@ t37: %t89 = load <2 x float>, ptr %t9, align 4 %x23 = extractelement <2 x float> %t89, i32 0 %x24 = extractelement <2 x float> %t89, i32 1 - br i1 undef, label %t37, label %t55 + br i1 %arg, label %t37, label %t55 t55: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll index edd1a2a3a2fff..364b0f4c1a3a7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll @@ -12,10 +12,10 @@ ; iteration (it was matched and vectorized, which added a use of a deleted ; instruction) -define void @test() { +define void @test(i1 %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[IF_END:%.*]], label [[FOR_COND_PREHEADER:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_END:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: ; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds [100 x i32], ptr undef, i64 0, i64 2 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds [100 x i32], ptr undef, i64 0, i64 3 @@ -35,7 +35,7 @@ define void @test() { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %if.end, label %for.cond.preheader + br i1 %arg, label %if.end, label %for.cond.preheader for.cond.preheader: ; preds = %entry %i = getelementptr inbounds [100 x i32], ptr undef, i64 0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll index eb5e218f057ce..ac9454967719e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll @@ -8,7 +8,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; as elements of the vectorized tree. 
; PR19621 -define void @test() { +define void @test(i1 %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: bb279: ; CHECK-NEXT: br label [[BB283:%.*]] @@ -22,13 +22,13 @@ define void @test() { ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef ; CHECK-NEXT: br label [[BB21_I:%.*]] ; CHECK: bb21.i: -; CHECK-NEXT: br i1 undef, label [[BB22_I:%.*]], label [[EXIT]] +; CHECK-NEXT: br i1 %arg, label [[BB22_I:%.*]], label [[EXIT]] ; CHECK: bb22.i: ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] ; CHECK-NEXT: br label [[BB32_I:%.*]] ; CHECK: bb32.i: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ] -; CHECK-NEXT: br i1 undef, label [[BB32_I]], label [[BB21_I]] +; CHECK-NEXT: br i1 %arg, label [[BB32_I]], label [[BB21_I]] ; CHECK: exit: ; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], @@ -57,7 +57,7 @@ bb284: br label %bb21.i bb21.i: - br i1 undef, label %bb22.i, label %exit + br i1 %arg, label %bb22.i, label %exit bb22.i: %tmp24.i = fadd double undef, %tmp9.i @@ -67,7 +67,7 @@ bb22.i: bb32.i: %xs.0.i = phi double [ %tmp24.i, %bb22.i ], [ 0.000000e+00, %bb32.i ] %ys.0.i = phi double [ %tmp26.i, %bb22.i ], [ 0.000000e+00, %bb32.i ] - br i1 undef, label %bb32.i, label %bb21.i + br i1 %arg, label %bb32.i, label %bb21.i exit: %tmp303 = fpext float %Av.sroa.0.0 to double diff --git a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll index f870cb44f4e5f..c5cdcdc1eb1a5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll @@ -8,7 +8,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; as elements of the vectorized tree. 
; PR19621 -define void @test() { +define void @test(i1 %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: bb279: ; CHECK-NEXT: br label [[BB283:%.*]] @@ -22,13 +22,13 @@ define void @test() { ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef ; CHECK-NEXT: br label [[BB21_I:%.*]] ; CHECK: bb21.i: -; CHECK-NEXT: br i1 undef, label [[BB22_I:%.*]], label [[EXIT]] +; CHECK-NEXT: br i1 %arg, label [[BB22_I:%.*]], label [[EXIT]] ; CHECK: bb22.i: ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] ; CHECK-NEXT: br label [[BB32_I:%.*]] ; CHECK: bb32.i: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ] -; CHECK-NEXT: br i1 undef, label [[BB32_I]], label [[BB21_I]] +; CHECK-NEXT: br i1 %arg, label [[BB32_I]], label [[BB21_I]] ; CHECK: exit: ; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], @@ -57,7 +57,7 @@ bb284: br label %bb21.i bb21.i: - br i1 undef, label %bb22.i, label %exit + br i1 %arg, label %bb22.i, label %exit bb22.i: %tmp24.i = fadd double undef, %tmp9.i @@ -67,7 +67,7 @@ bb22.i: bb32.i: %xs.0.i = phi double [ %tmp24.i, %bb22.i ], [ 0.000000e+00, %bb32.i ] %ys.0.i = phi double [ %tmp26.i, %bb22.i ], [ 0.000000e+00, %bb32.i ] - br i1 undef, label %bb32.i, label %bb21.i + br i1 %arg, label %bb32.i, label %bb21.i exit: %tmp303 = fpext float %Av.sroa.0.0 to double diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll index 17f9f371ff6ef..a69849fabcef6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=slp-vectorizer -S -mcpu=cascadelake -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s -define void @foo() { +define void @foo(i1 %arg) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = uitofp i16 undef to float @@ -14,7 +14,7 @@ define void @foo() { ; CHECK: bb2: ; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP14:%.*]], [[BB3:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr undef, align 8 -; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB3]], label [[BB4:%.*]] ; CHECK: bb4: ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> ; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double @@ -46,7 +46,7 @@ bb2: %2 = phi float [ undef, %bb1 ], [ %11, %bb3 ] %3 = phi float [ undef, %bb1 ], [ %12, %bb3 ] %4 = load double, ptr undef, align 8 - br i1 undef, label %bb3, label %bb4 + br i1 %arg, label %bb3, label %bb4 bb4: %ext = fpext float %3 to double diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll index b160c0174c0a7..ce13f478d3811 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -447,3 +447,37 @@ for.end.loopexit: store <4 x i32> %4, ptr %out2, align 4 ret void } + +define void @test14(<8 x i1> %0) { +; CHECK-LABEL: @test14( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.vector.insert.v16i1.v8i1(<16 x i1> poison, <8 x i1> [[TMP0:%.*]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i1> [[TMP1]], <16 x i1> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i16> +; 
CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[TMP7:%.*]] = phi <16 x i16> [ [[TMP6]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.vector.extract.v4i16.v16i16(<16 x i16> [[TMP7]], i64 12) +; CHECK-NEXT: [[OR0:%.*]] = or <4 x i16> [[TMP8]], zeroinitializer +; CHECK-NEXT: ret void +; +entry: + %sext0 = sext <8 x i1> %0 to <8 x i16> + %sext1 = sext <8 x i1> %0 to <8 x i16> + %1 = shufflevector <8 x i16> %sext0, <8 x i16> zeroinitializer, <4 x i32> + %2 = shufflevector <8 x i16> %sext0, <8 x i16> zeroinitializer, <4 x i32> + %3 = shufflevector <8 x i16> %sext1, <8 x i16> zeroinitializer, <4 x i32> + %4 = shufflevector <8 x i16> %sext1, <8 x i16> zeroinitializer, <4 x i32> + br label %for.end.loopexit + +for.end.loopexit: + %phi0 = phi <4 x i16> [ %1, %entry ] + %phi1 = phi <4 x i16> [ %2, %entry ] + %phi2 = phi <4 x i16> [ %3, %entry ] + %phi3 = phi <4 x i16> [ %4, %entry ] + %or0 = or <4 x i16> %phi1, zeroinitializer + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll index 9d6371b13e08a..a728515f0fdc3 100644 --- a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll @@ -7,10 +7,10 @@ ; to UMax and thus same reduction kind is returned. ; The routine's later code merely assumes the instruction to be a select. -define dso_local void @test() { +define dso_local void @test(i1 %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[NEXT:%.*]], label [[THEN:%.*]] +; CHECK-NEXT: br i1 %arg, label [[NEXT:%.*]], label [[THEN:%.*]] ; CHECK: then: ; CHECK-NEXT: [[UM:%.*]] = call i8 @llvm.umax.i8(i8 0, i8 undef) ; CHECK-NEXT: [[SELCMP:%.*]] = icmp ult i8 [[UM]], undef @@ -21,7 +21,7 @@ define dso_local void @test() { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %next, label %then + br i1 %arg, label %next, label %then then: %um = call i8 @llvm.umax.i8(i8 0, i8 undef) diff --git a/llvm/test/Transforms/Scalarizer/crash-bug.ll b/llvm/test/Transforms/Scalarizer/crash-bug.ll index 2195a37564c6a..692fc07e3ea5c 100644 --- a/llvm/test/Transforms/Scalarizer/crash-bug.ll +++ b/llvm/test/Transforms/Scalarizer/crash-bug.ll @@ -3,14 +3,14 @@ ; Don't crash -define void @foo() { -; CHECK-LABEL: define void @foo() { +define void @foo(i1 %arg) { +; CHECK-LABEL: define void @foo(i1 %arg) { ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB1]] ; CHECK: bb1: ; CHECK-NEXT: [[BB2_VEC_I1:%.*]] = phi i16 [ 200, [[TMP0:%.*]] ], [ [[BB2_VEC_I1]], [[BB2:%.*]] ] -; CHECK-NEXT: br i1 undef, label [[BB3:%.*]], label [[BB2]] +; CHECK-NEXT: br i1 %arg, label [[BB3:%.*]], label [[BB2]] ; CHECK: bb3: ; CHECK-NEXT: ret void ; @@ -22,7 +22,7 @@ bb2: ; preds = %bb1 bb1: ; preds = %bb2, %0 %bb1_vec = phi <2 x i16> [ , %0 ], [ %bb2_vec, %bb2 ] - br i1 undef, label %bb3, label %bb2 + br i1 %arg, label %bb3, label %bb2 bb3: ret void diff --git a/llvm/test/Transforms/Scalarizer/dbgloc-bug-inseltpoison.ll b/llvm/test/Transforms/Scalarizer/dbgloc-bug-inseltpoison.ll index e5ba35ca2c4ee..4f4dff84e1294 100644 --- 
a/llvm/test/Transforms/Scalarizer/dbgloc-bug-inseltpoison.ll +++ b/llvm/test/Transforms/Scalarizer/dbgloc-bug-inseltpoison.ll @@ -7,7 +7,7 @@ define i16 @f1() !dbg !5 { ret i16 undef, !dbg !9 } -define void @f2() !dbg !10 { +define void @f2(i1 %arg) !dbg !10 { bb1: %_tmp7 = tail call i16 @f1(), !dbg !13 ; CHECK: call i16 @f1(), !dbg !13 @@ -16,7 +16,7 @@ bb1: br label %vector.body vector.body: - br i1 undef, label %middle.block, label %vector.body + br i1 %arg, label %middle.block, label %vector.body middle.block: ret void, !dbg !15 diff --git a/llvm/test/Transforms/Scalarizer/dbgloc-bug.ll b/llvm/test/Transforms/Scalarizer/dbgloc-bug.ll index 639fd3d7c0957..3dd768e593eb4 100644 --- a/llvm/test/Transforms/Scalarizer/dbgloc-bug.ll +++ b/llvm/test/Transforms/Scalarizer/dbgloc-bug.ll @@ -7,7 +7,7 @@ define i16 @f1() !dbg !5 { ret i16 undef, !dbg !9 } -define void @f2() !dbg !10 { +define void @f2(i1 %arg) !dbg !10 { bb1: %_tmp7 = tail call i16 @f1(), !dbg !13 ; CHECK: call i16 @f1(), !dbg !13 @@ -16,7 +16,7 @@ bb1: br label %vector.body vector.body: - br i1 undef, label %middle.block, label %vector.body + br i1 %arg, label %middle.block, label %vector.body middle.block: ret void, !dbg !15 diff --git a/llvm/test/Transforms/Scalarizer/phi-unreachable-pred.ll b/llvm/test/Transforms/Scalarizer/phi-unreachable-pred.ll index 3b6f5ccf3d3ea..8ce912ff55d6e 100644 --- a/llvm/test/Transforms/Scalarizer/phi-unreachable-pred.ll +++ b/llvm/test/Transforms/Scalarizer/phi-unreachable-pred.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes='function(scalarizer)' -S -o - | FileCheck %s -define i16 @f1() { +define i16 @f1(i1 %arg) { ; CHECK-LABEL: @f1( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_END:%.*]] @@ -9,7 +9,7 @@ define i16 @f1() { ; CHECK-NEXT: [[INSERT:%.*]] = insertelement <4 x i16> [[INSERT]], i16 ptrtoint (ptr @f1 to i16), i32 0 ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY:%.*]], label [[FOR_END]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY:%.*]], label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[PHI_I0:%.*]] = phi i16 [ 1, [[ENTRY:%.*]] ], [ poison, [[FOR_COND]] ] ; CHECK-NEXT: ret i16 [[PHI_I0]] @@ -22,7 +22,7 @@ for.body: br label %for.cond for.cond: - br i1 undef, label %for.body, label %for.end + br i1 %arg, label %for.body, label %for.end for.end: ; opt used to hang when scalarizing this code. 
When scattering %insert we @@ -34,22 +34,22 @@ for.end: ret i16 %extract } -define void @f2() { +define void @f2(i1 %arg) { ; CHECK-LABEL: @f2( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_END8:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN:%.*]], label [[IF_END8:%.*]] ; CHECK: if.then: ; CHECK-NEXT: br label [[IF_END8]] ; CHECK: for.body2: -; CHECK-NEXT: br i1 undef, label [[FOR_END:%.*]], label [[FOR_INC:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_END:%.*]], label [[FOR_INC:%.*]] ; CHECK: for.end: ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[E_SROA_3_2:%.*]] = phi <2 x i64> [ splat (i64 1), [[FOR_END]] ], [ [[E_SROA_3_2]], [[FOR_BODY2:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 6, [[FOR_END]] ], [ [[TMP0]], [[FOR_BODY2]] ] -; CHECK-NEXT: br i1 undef, label [[FOR_BODY2]], label [[FOR_COND1_FOR_END7_CRIT_EDGE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY2]], label [[FOR_COND1_FOR_END7_CRIT_EDGE:%.*]] ; CHECK: for.cond1.for.end7_crit_edge: ; CHECK-NEXT: br label [[IF_END8]] ; CHECK: if.end8: @@ -61,13 +61,13 @@ entry: br label %for.body for.body: ; preds = %if.end8, %entry - br i1 undef, label %if.then, label %if.end8 + br i1 %arg, label %if.then, label %if.end8 if.then: ; preds = %for.body br label %if.end8 for.body2: ; preds = %for.inc - br i1 undef, label %for.end, label %for.inc + br i1 %arg, label %for.end, label %for.inc for.end: ; preds = %for.body2 br label %for.inc @@ -75,7 +75,7 @@ for.end: ; preds = %for.body2 for.inc: ; preds = %for.end, %for.body2 %e.sroa.3.2 = phi <2 x i64> [ , %for.end ], [ %e.sroa.3.2, %for.body2 ] %0 = phi i32 [ 6, %for.end ], [ %0, %for.body2 ] - br i1 undef, label %for.body2, label %for.cond1.for.end7_crit_edge + br i1 %arg, label %for.body2, label %for.cond1.for.end7_crit_edge for.cond1.for.end7_crit_edge: ; preds = %for.inc br label %if.end8 diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll index d800fca4727e3..ab95b523744be 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll @@ -1,12 +1,12 @@ ; RUN: opt -passes="loop-mssa(simple-loop-unswitch)" -verify-memoryssa -disable-output < %s ; PR10031 -define i32 @test(i32 %command) { +define i32 @test(i32 %command, i1 %arg) { entry: br label %tailrecurse tailrecurse: ; preds = %if.then14, %tailrecurse, %entry - br i1 undef, label %if.then, label %tailrecurse + br i1 %arg, label %if.then, label %tailrecurse if.then: ; preds = %tailrecurse switch i32 %command, label %sw.bb [ @@ -15,7 +15,7 @@ if.then: ; preds = %tailrecurse ] land.lhs.true: ; preds = %if.then, %if.then - br i1 undef, label %sw.bb, label %if.then14 + br i1 %arg, label %sw.bb, label %if.then14 if.then14: ; preds = %land.lhs.true switch i32 %command, label %tailrecurse [ diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll index 8ad869015f44e..229bbb2f3929f 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-30-LoopUnswitch-LPad-Crash.ll @@ -8,7 +8,7 @@ target triple = "x86_64-apple-macosx10.7.0" %class.B.21.41.65.101.137.157.177.197.237.241.245.249.261.293.301.337.345.378 = type { ptr } 
%class.A.20.40.64.100.136.156.176.196.236.240.244.248.260.292.300.336.344.377 = type { i8 } -define void @_Z23get_reconstruction_pathv() uwtable ssp personality ptr @__gxx_personality_v0 { +define void @_Z23get_reconstruction_pathv(i1 %arg) uwtable ssp personality ptr @__gxx_personality_v0 { entry: %c = alloca %class.D.22.42.66.102.138.158.178.198.238.242.246.250.262.294.302.338.346.379, align 8 br label %for.cond @@ -30,7 +30,7 @@ invoke.cont4: ; preds = %for.cond3 to label %invoke.cont6 unwind label %lpad invoke.cont6: ; preds = %invoke.cont4 - br i1 undef, label %for.cond3, label %for.end + br i1 %arg, label %for.cond3, label %for.end lpad: ; preds = %for.end, %invoke.cont4, %for.cond3, %invoke.cont, %for.cond %0 = landingpad { ptr, i32 } diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll index 60608e8df0868..2d24ef4afab07 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll @@ -6,12 +6,12 @@ target datalayout = "e-m:e-p:16:16-p1:32:16-i32:16-i64:16-n8:16" -define void @foo() { +define void @foo(i1 %arg) { ; CHECK-LABEL: @foo entry: %arrayidx.i1 = getelementptr inbounds i16, ptr undef, i16 undef %arrayidx.i = addrspacecast ptr %arrayidx.i1 to ptr addrspace(1) - br i1 undef, label %for.body.i, label %bar.exit + br i1 %arg, label %for.body.i, label %bar.exit for.body.i: ; preds = %for.body.i, %entry ; When we call makeLoopInvariant (i.e. trivial LICM) on this load, it diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/delete-dead-blocks.ll b/llvm/test/Transforms/SimpleLoopUnswitch/delete-dead-blocks.ll index 9ca554023a8cf..c120eeb440d19 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/delete-dead-blocks.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/delete-dead-blocks.ll @@ -47,7 +47,7 @@ get_out2: ; ; This comes from PR38778 ; CHECK-LABEL: @Test2 -define void @Test2(i32) { +define void @Test2(i32, i1 %arg) { header: br label %loop loop: @@ -58,12 +58,12 @@ loop: ] ; CHECK-NOT: {{^}}guarded1: guarded1: - br i1 undef, label %continue, label %leave + br i1 %arg, label %continue, label %leave guarded2: br label %continue check: %val = add i32 0, 1 - br i1 undef, label %continue, label %leave + br i1 %arg, label %continue, label %leave continue: br label %loop leave: @@ -75,7 +75,7 @@ leave: ; Yet another test from PR38778 ; ; CHECK-LABEL: @Test3 -define void @Test3(i32) { +define void @Test3(i32, i1 %arg) { header: br label %outer outer: @@ -95,7 +95,7 @@ case2: br label %continue continue: %local_11_92 = phi i32 [ 0, %switchme ], [ 18, %case2 ], [ 0, %overflow ] - br i1 undef, label %outer, label %inner + br i1 %arg, label %outer, label %inner go_out: unreachable } diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/formDedicatedAfterTrivial1.ll b/llvm/test/Transforms/SimpleLoopUnswitch/formDedicatedAfterTrivial1.ll index 19a2bc3ad0449..45da85299051f 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/formDedicatedAfterTrivial1.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/formDedicatedAfterTrivial1.ll @@ -2,7 +2,7 @@ ; PR38283 ; PR38737 -define void @f1() { +define void @f1(i1 %arg) { for.cond1thread-pre-split.lr.ph.lr.ph: %tobool4 = icmp eq i16 undef, 0 br label %for.cond1thread-pre-split @@ -18,7 +18,7 @@ if.end: ; preds = %for.body2 br i1 %tobool4, label %if.end6, label %for.cond1thread-pre-split if.end6: ; preds = %if.end - br i1 undef, label %for.body2, label %for.end + br i1 %arg, label 
%for.body2, label %for.end for.end: ; preds = %if.end6, %for.body2 ret void diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll index 07668616ff86d..533b1f691f5ad 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --version 5 ; RUN: opt -passes='loop(simple-loop-unswitch),verify' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='simple-loop-unswitch' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='loop-mssa(simple-loop-unswitch),verify' -simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s | FileCheck %s @@ -131,30 +132,55 @@ exit: ret void } -define void @test_nested_loop(i1 %cond, i32 %N) { -; CHECK-LABEL: @test_nested_loop( +define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { +; CHECK-LABEL: define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[ENTRY_SPLIT:%.*]], label [[OUTER_LOOP_SPLIT:%.*]] +; CHECK-NEXT: br i1 %cond, label %entry.split, label %outer_loop.split ; CHECK: entry.split: -; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] +; CHECK-NEXT: br i1 %arg, label %entry.split.split.us, label %entry.split.split +; CHECK: entry.split.split.us: +; CHECK-NEXT: br label %outer_loop.us +; CHECK: outer_loop.us: +; CHECK-NEXT: br label %outer_loop.split.us.us +; CHECK: outer_backedge.us: +; CHECK-NEXT: br label %outer_loop.us +; CHECK: outer_loop.split.us.us: +; CHECK-NEXT: br label %loop.us.us +; CHECK: loop.us.us: +; CHECK-NEXT: %iv.us.us = phi i32 [ 0, %outer_loop.split.us.us ], [ %iv.next.us.us, %guarded.us.us ] +; CHECK-NEXT: br label %guarded.us.us +; CHECK: guarded.us.us: +; CHECK-NEXT: %iv.next.us.us = add i32 %iv.us.us, 1 +; CHECK-NEXT: %loop.cond.us.us = icmp slt i32 %iv.next.us.us, %N +; CHECK-NEXT: br i1 %loop.cond.us.us, label %loop.us.us, label %outer_backedge.split.us.us +; CHECK: outer_backedge.split.us.us: +; CHECK-NEXT: br label %outer_backedge.us +; CHECK: entry.split.split: +; CHECK-NEXT: br label %outer_loop ; CHECK: outer_loop: -; CHECK-NEXT: br label [[OUTER_LOOP_SPLIT_US:%.*]] +; CHECK-NEXT: br label %outer_loop.split.us ; CHECK: outer_loop.split.us: -; CHECK-NEXT: br label [[LOOP_US:%.*]] +; CHECK-NEXT: br label %loop.us ; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[OUTER_LOOP_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US]] +; CHECK-NEXT: %iv.us = phi i32 [ 0, %outer_loop.split.us ], [ %iv.next.us, %guarded.us ] +; CHECK-NEXT: br label %guarded.us ; CHECK: guarded.us: -; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[OUTER_BACKEDGE_SPLIT_US:%.*]] +; CHECK-NEXT: %iv.next.us = add i32 %iv.us, 1 +; CHECK-NEXT: %loop.cond.us = icmp slt i32 %iv.next.us, %N +; CHECK-NEXT: br i1 %loop.cond.us, label %loop.us, label %outer_backedge.split.us ; CHECK: outer_backedge.split.us: -; CHECK-NEXT: br label [[OUTER_BACKEDGE:%.*]] +; CHECK-NEXT: br label %outer_backedge +; CHECK: outer_loop.split: +; CHECK-NEXT: br label %loop +; CHECK: loop: +; CHECK-NEXT: br label %deopt ; CHECK: deopt: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable ; CHECK: outer_backedge: -; CHECK-NEXT: br i1 false, label [[OUTER_LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: br label %exit +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: @@ -171,7 +197,7 @@ loop: br i1 %loop.cond, label %loop, label %outer_backedge outer_backedge: - br i1 undef, label %outer_loop, label %exit + br i1 %arg, label %outer_loop, label %exit exit: ret void diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/pr37888.ll b/llvm/test/Transforms/SimpleLoopUnswitch/pr37888.ll index 099d6a5456e8e..7c9bb1b8520d6 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/pr37888.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/pr37888.ll @@ -29,7 +29,7 @@ if.end: br label %for.inc for.inc: - br i1 undef, label %for.body, label %for.end + br i1 false, label %for.body, label %for.end for.end: ret void diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll b/llvm/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll index cde0d9baf7599..51cce1c3e571b 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll @@ -74,13 +74,13 @@ bb10: ; preds = %bb8, %bb ; This is a simplified form of ineqn from above. It triggers some ; different cases in the loop-unswitch code. -define void @simplified_ineqn() nounwind readonly { +define void @simplified_ineqn(i1 %arg) nounwind readonly { entry: br label %bb8.outer bb8.outer: ; preds = %bb6, %bb2, %entry %x = phi i32 [ 0, %entry ], [ 0, %bb6 ], [ 1, %bb2 ] ; [#uses=1] - br i1 undef, label %return, label %bb2 + br i1 %arg, label %return, label %bb2 bb2: ; preds = %bb switch i32 %x, label %bb6 [ @@ -88,7 +88,7 @@ bb2: ; preds = %bb ] bb6: ; preds = %bb2 - br i1 undef, label %bb8.outer, label %bb2 + br i1 %arg, label %bb8.outer, label %bb2 return: ; preds = %bb8, %bb ret void @@ -97,17 +97,17 @@ return: ; preds = %bb8, %bb ; This function requires special handling to preserve LCSSA form. 
; PR4934 -define void @pnp_check_irq() nounwind noredzone { +define void @pnp_check_irq(i1 %arg) nounwind noredzone { entry: %conv56 = trunc i64 undef to i32 ; [#uses=1] br label %while.cond.i while.cond.i: ; preds = %while.cond.i.backedge, %entry %call.i25 = call ptr @pci_get_device() nounwind noredzone ; [#uses=2] - br i1 undef, label %if.then65, label %while.body.i + br i1 %arg, label %if.then65, label %while.body.i while.body.i: ; preds = %while.cond.i - br i1 undef, label %if.then31.i.i, label %while.cond.i.backedge + br i1 %arg, label %if.then31.i.i, label %while.cond.i.backedge while.cond.i.backedge: ; preds = %if.then31.i.i, %while.body.i br label %while.cond.i diff --git a/llvm/test/Transforms/SimplifyCFG/2004-12-10-SimplifyCFGCrash.ll b/llvm/test/Transforms/SimplifyCFG/2004-12-10-SimplifyCFGCrash.ll index dbd8f7ab862d0..0656811b39a9e 100644 --- a/llvm/test/Transforms/SimplifyCFG/2004-12-10-SimplifyCFGCrash.ll +++ b/llvm/test/Transforms/SimplifyCFG/2004-12-10-SimplifyCFGCrash.ll @@ -1,36 +1,36 @@ ; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -disable-output -define void @symhash_add() { +define void @symhash_add(i1 %arg) { entry: - br i1 undef, label %then.0, label %UnifiedReturnBlock + br i1 %arg, label %then.0, label %UnifiedReturnBlock then.0: ; preds = %entry - br i1 undef, label %loopentry.2, label %loopentry.1.preheader + br i1 %arg, label %loopentry.2, label %loopentry.1.preheader loopentry.1.preheader: ; preds = %then.0 br label %loopentry.1.outer loopentry.1.outer: ; preds = %loopexit.1, %loopentry.1.preheader br label %loopentry.1 loopentry.1: ; preds = %endif.1, %then.4, %then.3, %then.1, %loopentry.1.outer - br i1 undef, label %loopexit.1, label %no_exit.1 + br i1 %arg, label %loopexit.1, label %no_exit.1 no_exit.1: ; preds = %loopentry.1 - br i1 undef, label %then.1, label %else.0 + br i1 %arg, label %then.1, label %else.0 then.1: ; preds = %no_exit.1 br label %loopentry.1 else.0: ; preds = %no_exit.1 - br i1 undef, label %then.2, label %else.1 + br i1 %arg, label %then.2, label %else.1 then.2: ; preds = %else.0 - br i1 undef, label %then.3, label %endif.1 + br i1 %arg, label %then.3, label %endif.1 then.3: ; preds = %then.2 br label %loopentry.1 else.1: ; preds = %else.0 - br i1 undef, label %endif.1, label %then.4 + br i1 %arg, label %endif.1, label %then.4 then.4: ; preds = %else.1 br label %loopentry.1 endif.1: ; preds = %else.1, %then.2 br label %loopentry.1 loopexit.1: ; preds = %loopentry.1 - br i1 undef, label %loopentry.1.outer, label %loopentry.2 + br i1 %arg, label %loopentry.1.outer, label %loopentry.2 loopentry.2: ; preds = %no_exit.2, %loopexit.1, %then.0 - br i1 undef, label %loopexit.2, label %no_exit.2 + br i1 %arg, label %loopexit.2, label %no_exit.2 no_exit.2: ; preds = %loopentry.2 br label %loopentry.2 loopexit.2: ; preds = %loopentry.2 diff --git a/llvm/test/Transforms/SimplifyCFG/2006-06-12-InfLoop.ll b/llvm/test/Transforms/SimplifyCFG/2006-06-12-InfLoop.ll index 0ec88ed071fc2..20c03e05ba0b9 100644 --- a/llvm/test/Transforms/SimplifyCFG/2006-06-12-InfLoop.ll +++ b/llvm/test/Transforms/SimplifyCFG/2006-06-12-InfLoop.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -disable-output ; END. 
-define void @main(i32 %c) { +define void @main(i32 %c, i1 %arg) { entry: %tmp.9 = icmp eq i32 %c, 2 ; [#uses=1] br i1 %tmp.9, label %endif.0, label %then.0 @@ -186,7 +186,7 @@ no_exit.0.i31.preheader: ; preds = %loopentry.1.i30 no_exit.0.i31: ; preds = %loopexit.3.i, %no_exit.0.i31.preheader br i1 false, label %then.1.i, label %else.0.i then.1.i: ; preds = %no_exit.0.i31 - br i1 undef, label %then.0.i29, label %loopentry.0.i31 + br i1 %arg, label %then.0.i29, label %loopentry.0.i31 then.0.i29: ; preds = %then.1.i unreachable loopentry.0.i31: ; preds = %then.1.i @@ -194,13 +194,13 @@ loopentry.0.i31: ; preds = %then.1.i no_exit.0.i38.preheader: ; preds = %loopentry.0.i31 br label %no_exit.0.i38 no_exit.0.i38: ; preds = %no_exit.0.i38, %no_exit.0.i38.preheader - br i1 undef, label %no_exit.0.i38, label %loopentry.1.i.preheader.loopexit + br i1 %arg, label %no_exit.0.i38, label %loopentry.1.i.preheader.loopexit loopentry.1.i.preheader.loopexit: ; preds = %no_exit.0.i38 br label %loopentry.1.i.preheader loopentry.1.i.preheader: ; preds = %loopentry.1.i.preheader.loopexit, %loopentry.0.i31 br label %loopentry.1.i loopentry.1.i: ; preds = %endif.2.i, %loopentry.1.i.preheader - br i1 undef, label %loopentry.2.i39.preheader, label %loopexit.1.i79.loopexit2 + br i1 %arg, label %loopentry.2.i39.preheader, label %loopexit.1.i79.loopexit2 loopentry.2.i39.preheader: ; preds = %loopentry.1.i br label %loopentry.2.i39 loopentry.2.i39: ; preds = %loopexit.5.i77, %loopentry.2.i39.preheader @@ -212,45 +212,45 @@ loopentry.3.i40: ; preds = %loopexit.3.i51, %loopentry.3.i40.preheader no_exit.3.preheader.i42: ; preds = %loopentry.3.i40 br label %no_exit.3.i49 no_exit.3.i49: ; preds = %no_exit.3.i49, %no_exit.3.preheader.i42 - br i1 undef, label %no_exit.3.i49, label %loopexit.3.i51.loopexit + br i1 %arg, label %no_exit.3.i49, label %loopexit.3.i51.loopexit loopexit.3.i51.loopexit: ; preds = %no_exit.3.i49 br label %loopexit.3.i51 loopexit.3.i51: ; preds = %loopexit.3.i51.loopexit, %loopentry.3.i40 - br i1 undef, label %loopentry.3.i40, label %loopentry.4.i52 + br i1 %arg, label %loopentry.3.i40, label %loopentry.4.i52 loopentry.4.i52: ; preds = %loopexit.3.i51 br i1 false, label %no_exit.4.i54.preheader, label %hamming.exit.i71 no_exit.4.i54.preheader: ; preds = %loopentry.4.i52 br label %no_exit.4.i54 no_exit.4.i54: ; preds = %no_exit.4.backedge.i, %no_exit.4.i54.preheader - br i1 undef, label %then.1.i55, label %endif.1.i56 + br i1 %arg, label %then.1.i55, label %endif.1.i56 then.1.i55: ; preds = %no_exit.4.i54 - br i1 undef, label %no_exit.4.backedge.i, label %loopexit.4.i57 + br i1 %arg, label %no_exit.4.backedge.i, label %loopexit.4.i57 no_exit.4.backedge.i: ; preds = %endif.1.i56, %then.1.i55 br label %no_exit.4.i54 endif.1.i56: ; preds = %no_exit.4.i54 - br i1 undef, label %no_exit.4.backedge.i, label %loopexit.4.i57 + br i1 %arg, label %no_exit.4.backedge.i, label %loopexit.4.i57 loopexit.4.i57: ; preds = %endif.1.i56, %then.1.i55 br i1 false, label %no_exit.i.i69.preheader, label %hamming.exit.i71 no_exit.i.i69.preheader: ; preds = %loopexit.4.i57 br label %no_exit.i.i69 no_exit.i.i69: ; preds = %no_exit.i.i69, %no_exit.i.i69.preheader - br i1 undef, label %no_exit.i.i69, label %hamming.exit.i71.loopexit + br i1 %arg, label %no_exit.i.i69, label %hamming.exit.i71.loopexit hamming.exit.i71.loopexit: ; preds = %no_exit.i.i69 br label %hamming.exit.i71 hamming.exit.i71: ; preds = %hamming.exit.i71.loopexit, %loopexit.4.i57, %loopentry.4.i52, %loopentry.2.i39 - br i1 undef, label %endif.2.i, label 
%loopentry.5.i72 + br i1 %arg, label %endif.2.i, label %loopentry.5.i72 loopentry.5.i72: ; preds = %hamming.exit.i71 br i1 false, label %shortcirc_next.i74.preheader, label %loopexit.5.i77 shortcirc_next.i74.preheader: ; preds = %loopentry.5.i72 br label %shortcirc_next.i74 shortcirc_next.i74: ; preds = %no_exit.5.i76, %shortcirc_next.i74.preheader - br i1 undef, label %no_exit.5.i76, label %loopexit.5.i77.loopexit + br i1 %arg, label %no_exit.5.i76, label %loopexit.5.i77.loopexit no_exit.5.i76: ; preds = %shortcirc_next.i74 - br i1 undef, label %shortcirc_next.i74, label %loopexit.5.i77.loopexit + br i1 %arg, label %shortcirc_next.i74, label %loopexit.5.i77.loopexit loopexit.5.i77.loopexit: ; preds = %no_exit.5.i76, %shortcirc_next.i74 br label %loopexit.5.i77 loopexit.5.i77: ; preds = %loopexit.5.i77.loopexit, %loopentry.5.i72 - br i1 undef, label %loopentry.2.i39, label %loopexit.1.i79.loopexit + br i1 %arg, label %loopentry.2.i39, label %loopexit.1.i79.loopexit endif.2.i: ; preds = %hamming.exit.i71 br label %loopentry.1.i loopexit.1.i79.loopexit: ; preds = %loopexit.5.i77 @@ -258,7 +258,7 @@ loopexit.1.i79.loopexit: ; preds = %loopexit.5.i77 loopexit.1.i79.loopexit2: ; preds = %loopentry.1.i br label %loopexit.1.i79 loopexit.1.i79: ; preds = %loopexit.1.i79.loopexit2, %loopexit.1.i79.loopexit - br i1 undef, label %then.3.i, label %loopentry.6.i80 + br i1 %arg, label %then.3.i, label %loopentry.6.i80 then.3.i: ; preds = %loopexit.1.i79 br i1 false, label %no_exit.6.i82.preheader, label %run.exit loopentry.6.i80: ; preds = %loopexit.1.i79 @@ -266,7 +266,7 @@ loopentry.6.i80: ; preds = %loopexit.1.i79 no_exit.6.i82.preheader: ; preds = %loopentry.6.i80, %then.3.i br label %no_exit.6.i82 no_exit.6.i82: ; preds = %no_exit.6.i82, %no_exit.6.i82.preheader - br i1 undef, label %no_exit.6.i82, label %run.exit.loopexit + br i1 %arg, label %no_exit.6.i82, label %run.exit.loopexit run.exit.loopexit: ; preds = %no_exit.6.i82 br label %run.exit run.exit: ; preds = %run.exit.loopexit, %loopentry.6.i80, %then.3.i diff --git a/llvm/test/Transforms/SimplifyCFG/branch-on-undef.ll b/llvm/test/Transforms/SimplifyCFG/branch-on-undef.ll index bc42ae60730a7..582a38f4af12f 100644 --- a/llvm/test/Transforms/SimplifyCFG/branch-on-undef.ll +++ b/llvm/test/Transforms/SimplifyCFG/branch-on-undef.ll @@ -3,10 +3,10 @@ declare void @foo(i32) -define void @br_undef_simple() { +define void @br_undef_simple(i1 %arg) { ; CHECK-LABEL: @br_undef_simple( ; CHECK-NEXT: call void @foo(i32 0) -; CHECK-NEXT: br i1 undef, label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: if: @@ -17,7 +17,7 @@ define void @br_undef_simple() { ; CHECK-NEXT: br label [[COMMON_RET]] ; call void @foo(i32 0) - br i1 undef, label %if, label %else + br i1 %arg, label %if, label %else if: call void @foo(i32 1) diff --git a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll index bd63e6658c9a6..9fb5164d44d45 100644 --- a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll +++ b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll @@ -939,7 +939,7 @@ if.end.loopexit: } @f.b = external global i8, align 1 -define void @pr48450_3() { +define void @pr48450_3(i1 %arg) { ; CHECK-LABEL: @pr48450_3( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_COND1:%.*]] @@ -979,7 +979,7 @@ for.cond.cleanup: br label %cleanup for.body4: - br i1 undef, label %if.then6, label 
%if.end7 + br i1 %arg, label %if.then6, label %if.end7 if.then6: br label %cleanup diff --git a/llvm/test/Transforms/SimplifyCFG/pr34131.ll b/llvm/test/Transforms/SimplifyCFG/pr34131.ll index 89415cc88aeb1..ed37a969aa47f 100644 --- a/llvm/test/Transforms/SimplifyCFG/pr34131.ll +++ b/llvm/test/Transforms/SimplifyCFG/pr34131.ll @@ -4,7 +4,7 @@ ; Earlier version using auto-generated checks from utils/update_test_checks.py ; had bot problems though... -define void @patatino() { +define void @patatino(i1 %arg) { ; CHECK-LABEL: @patatino @@ -12,16 +12,16 @@ define void @patatino() { bb1: ; preds = %bb36, %0 br label %bb2 bb2: ; preds = %bb3, %bb1 - br i1 undef, label %bb4, label %bb3 + br i1 %arg, label %bb4, label %bb3 bb3: ; preds = %bb4, %bb2 - br i1 undef, label %bb2, label %bb5 + br i1 %arg, label %bb2, label %bb5 bb4: ; preds = %bb2 switch i32 undef, label %bb3 [ ] bb5: ; preds = %bb3 br label %bb6 bb6: ; preds = %bb5 - br i1 undef, label %bb7, label %bb9 + br i1 %arg, label %bb7, label %bb9 bb7: ; preds = %bb6 %tmp = or i64 undef, 1 %tmp8 = icmp ult i64 %tmp, 0 @@ -58,17 +58,17 @@ bb27: ; preds = %bb24 %tmp29 = icmp ult i64 %tmp28, 0 br i1 %tmp29, label %bb30, label %bb9 bb30: ; preds = %bb27 - br i1 undef, label %bb31, label %bb9 + br i1 %arg, label %bb31, label %bb9 bb31: ; preds = %bb30 - br i1 undef, label %bb32, label %bb9 + br i1 %arg, label %bb32, label %bb9 bb32: ; preds = %bb31 - br i1 undef, label %bb33, label %bb9 + br i1 %arg, label %bb33, label %bb9 bb33: ; preds = %bb32 - br i1 undef, label %bb34, label %bb9 + br i1 %arg, label %bb34, label %bb9 bb34: ; preds = %bb33 - br i1 undef, label %bb35, label %bb9 + br i1 %arg, label %bb35, label %bb9 bb35: ; preds = %bb34 - br i1 undef, label %bb36, label %bb9 + br i1 %arg, label %bb36, label %bb9 bb36: ; preds = %bb35 - br i1 undef, label %bb1, label %bb10 + br i1 %arg, label %bb1, label %bb10 } diff --git a/llvm/test/Transforms/Sink/dead-user.ll b/llvm/test/Transforms/Sink/dead-user.ll index 91e61b43ca391..e63aa027a2db8 100644 --- a/llvm/test/Transforms/Sink/dead-user.ll +++ b/llvm/test/Transforms/Sink/dead-user.ll @@ -2,27 +2,26 @@ ; Compiler should not be broken with a dead user. 
; RUN: opt -passes=sink -S < %s | FileCheck %s -define void @test(i16 %p1) { -; CHECK-LABEL: define void @test( -; CHECK-SAME: i16 [[P1:%.*]]) { +define void @test(i16 %p1, i1 %arg) { +; CHECK-LABEL: define void @test(i16 %p1, i1 %arg) { ; CHECK-NEXT: bb.0: -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[P1]] to i32 -; CHECK-NEXT: br i1 undef, label [[BB_1:%.*]], label [[BB_3:%.*]] +; CHECK-NEXT: %conv = sext i16 %p1 to i32 +; CHECK-NEXT: br i1 %arg, label %bb.1, label %bb.3 ; CHECK: bb.1: -; CHECK-NEXT: br label [[BB_2:%.*]] +; CHECK-NEXT: br label %bb.2 ; CHECK: bb.2: -; CHECK-NEXT: [[AND_2:%.*]] = and i32 undef, [[CONV]] -; CHECK-NEXT: br label [[BB_2]] +; CHECK-NEXT: %and.2 = and i32 undef, %conv +; CHECK-NEXT: br label %bb.2 ; CHECK: bb.3: -; CHECK-NEXT: [[AND_3:%.*]] = and i32 undef, [[CONV]] -; CHECK-NEXT: br label [[BB_3]] +; CHECK-NEXT: %and.3 = and i32 undef, %conv +; CHECK-NEXT: br label %bb.3 ; CHECK: dead: -; CHECK-NEXT: [[AND_DEAD:%.*]] = and i32 undef, [[CONV]] -; CHECK-NEXT: br label [[DEAD:%.*]] +; CHECK-NEXT: %and.dead = and i32 undef, %conv +; CHECK-NEXT: br label %dead ; bb.0: %conv = sext i16 %p1 to i32 - br i1 undef, label %bb.1, label %bb.3 + br i1 %arg, label %bb.1, label %bb.3 bb.1: ; preds = %bb.0 br label %bb.2 diff --git a/llvm/test/Transforms/SpeculativeExecution/PR46267.ll b/llvm/test/Transforms/SpeculativeExecution/PR46267.ll index 8f82a16639744..5ee9fda470d1e 100644 --- a/llvm/test/Transforms/SpeculativeExecution/PR46267.ll +++ b/llvm/test/Transforms/SpeculativeExecution/PR46267.ll @@ -27,11 +27,11 @@ end: ; preds = %notnull, %entry ret ptr %i6 } -define void @f(i32 %i) { +define void @f(i32 %i, i1 %arg) { entry: ; CHECK-LABEL: @f( ; CHECK: %a2 = add i32 %i, 0 - br i1 undef, label %land.rhs, label %land.end + br i1 %arg, label %land.rhs, label %land.end land.rhs: ; preds = %entry ; CHECK: land.rhs: diff --git a/llvm/test/Transforms/StructurizeCFG/loop-continue-phi.ll b/llvm/test/Transforms/StructurizeCFG/loop-continue-phi.ll index 364eb161e298a..eec67e67b540d 100644 --- a/llvm/test/Transforms/StructurizeCFG/loop-continue-phi.ll +++ b/llvm/test/Transforms/StructurizeCFG/loop-continue-phi.ll @@ -1,24 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -o - -structurizecfg < %s | FileCheck %s -define void @test1() { +define void @test1(i1 %arg) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: %arg.inv = xor i1 %arg, true +; CHECK-NEXT: br label %loop ; CHECK: Flow: -; CHECK-NEXT: br label [[FLOW1:%.*]] +; CHECK-NEXT: br label %Flow1 ; CHECK: loop: -; CHECK-NEXT: [[CTR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[FLOW1]] ] -; CHECK-NEXT: [[CTR_NEXT:%.*]] = add i32 [[CTR]], 1 -; CHECK-NEXT: br i1 undef, label [[LOOP_A:%.*]], label [[FLOW1]] +; CHECK-NEXT: %ctr = phi i32 [ 0, %entry ], [ %0, %Flow1 ] +; CHECK-NEXT: %ctr.next = add i32 %ctr, 1 +; CHECK-NEXT: br i1 %arg.inv, label %loop.a, label %Flow1 ; CHECK: loop.a: -; CHECK-NEXT: br i1 undef, label [[LOOP_B:%.*]], label [[FLOW:%.*]] +; CHECK-NEXT: br i1 %arg.inv, label %loop.b, label %Flow ; CHECK: loop.b: -; CHECK-NEXT: br label [[FLOW]] +; CHECK-NEXT: br label %Flow ; CHECK: Flow1: -; CHECK-NEXT: [[TMP0]] = phi i32 [ [[CTR_NEXT]], [[FLOW]] ], [ undef, [[LOOP]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, [[FLOW]] ], [ true, [[LOOP]] ] -; CHECK-NEXT: br i1 [[TMP1]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: %0 = phi i32 [ %ctr.next, %Flow ], [ undef, %loop ] +; CHECK-NEXT: %1 = phi i1 [ false, %Flow ], [ 
true, %loop ] +; CHECK-NEXT: br i1 %1, label %exit, label %loop ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -28,10 +29,10 @@ entry: loop: %ctr = phi i32 [ 0, %entry ], [ %ctr.next, %loop.a ], [ %ctr.next, %loop.b ] %ctr.next = add i32 %ctr, 1 - br i1 undef, label %exit, label %loop.a + br i1 %arg, label %exit, label %loop.a loop.a: - br i1 undef, label %loop, label %loop.b + br i1 %arg, label %loop, label %loop.b loop.b: br label %loop diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/vmv.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/vmv.s index 31178e8e238f1..e69b7fb38295e 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/vmv.s +++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/vmv.s @@ -293,12 +293,12 @@ vfmv.f.s f7, v16 # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 280 -# CHECK-NEXT: Total Cycles: 523 +# CHECK-NEXT: Total Cycles: 524 # CHECK-NEXT: Total uOps: 280 # CHECK: Dispatch Width: 3 -# CHECK-NEXT: uOps Per Cycle: 0.54 -# CHECK-NEXT: IPC: 0.54 +# CHECK-NEXT: uOps Per Cycle: 0.53 +# CHECK-NEXT: IPC: 0.53 # CHECK-NEXT: Block RThroughput: 512.0 # CHECK: Instruction Info: diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/vmv.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/vmv.s index 3e9dcff4e1c0a..99b72b1fabbae 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/vmv.s +++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP600/vmv.s @@ -260,12 +260,12 @@ vmv8r.v v8, v16 # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 256 -# CHECK-NEXT: Total Cycles: 237 +# CHECK-NEXT: Total Cycles: 255 # CHECK-NEXT: Total uOps: 256 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.08 -# CHECK-NEXT: IPC: 1.08 +# CHECK-NEXT: uOps Per Cycle: 1.00 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 240.0 # CHECK: Instruction Info: diff --git a/llvm/test/tools/llvm-objdump/ELF/AArch64/mattr.s b/llvm/test/tools/llvm-objdump/ELF/AArch64/mattr.s index e236660770648..5fd77b579984a 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AArch64/mattr.s +++ b/llvm/test/tools/llvm-objdump/ELF/AArch64/mattr.s @@ -1,6 +1,10 @@ ## When --mattr and --mcpu are both empty, disassemble all known instructions. # RUN: llvm-mc -filetype=obj -triple=aarch64 -mattr=+all %s -o %t # RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s --check-prefixes=CHECK,ALL +# RUN: llvm-mc -filetype=obj -triple=aarch64_be -mattr=+all %s -o %t +# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s --check-prefixes=CHECK,ALL +# RUN: llvm-mc -filetype=obj -triple=aarch64_32 -mattr=+all %s -o %t +# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s --check-prefixes=CHECK,ALL ## If --mattr or --mcpu is specified, don't default to --mattr=+all. 
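The big-endian and ILP32 RUN lines added above are enabled by the llvm-objdump change that follows, which keys the implicit `--mattr=+all` default off `Triple::isAArch64()` instead of an exact-arch comparison. A hedged, illustrative sketch (standalone, not llvm-objdump's own code) of why the predicate is broader:

#include "llvm/TargetParser/Triple.h"
#include <cassert>

int main() {
  using llvm::Triple;
  // isAArch64() accepts every AArch64 flavour...
  assert(Triple("aarch64--").isAArch64());
  assert(Triple("aarch64_be--").isAArch64());  // big-endian
  assert(Triple("aarch64_32--").isAArch64());  // ILP32
  // ...while the old getArch() comparison matched only little-endian AArch64.
  assert(Triple("aarch64_be--").getArch() != Triple::aarch64);
  return 0;
}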
# RUN: llvm-objdump -d --no-show-raw-insn --mattr=+v8a %t | FileCheck %s --check-prefixes=CHECK,UNKNOWN diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 86ba9193dff2d..246d5cfa05818 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -2556,7 +2556,7 @@ static void disassembleObject(ObjectFile *Obj, bool InlineRelocs) { if (!MAttrs.empty()) { for (unsigned I = 0; I != MAttrs.size(); ++I) Features.AddFeature(MAttrs[I]); - } else if (MCPU.empty() && Obj->getArch() == llvm::Triple::aarch64) { + } else if (MCPU.empty() && Obj->makeTriple().isAArch64()) { Features.AddFeature("+all"); } diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 2acf1cc34b2d8..1d9d7bcf76549 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -2967,8 +2967,10 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { std::unique_ptr<ProfileSummary> PS(Builder.getSummary()); bool IsIR = Reader->isIRLevelProfile(); OS << "Instrumentation level: " << (IsIR ? "IR" : "Front-end"); - if (IsIR) + if (IsIR) { OS << " entry_first = " << Reader->instrEntryBBEnabled(); + OS << " instrument_loop_entries = " << Reader->instrLoopEntriesEnabled(); + } OS << "\n"; if (ShowAllFunctions || !FuncNameFilter.empty()) OS << "Functions shown: " << ShownFunctions << "\n"; diff --git a/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp index a907dfcf2cec5..8ae05c4ddc59a 100644 --- a/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp @@ -518,6 +518,71 @@ TEST_F(CoreAPIsStandardTest, TestTrivialCircularDependency) { << "Self-dependency prevented symbol from being marked ready"; } +TEST_F(CoreAPIsStandardTest, TestBasicQueryDependenciesReporting) { + // Test that dependencies are reported as expected. + + bool DependenciesCallbackRan = false; + + std::unique_ptr<MaterializationResponsibility> FooR; + std::unique_ptr<MaterializationResponsibility> BarR; + + cantFail(JD.define(std::make_unique<SimpleMaterializationUnit>( + SymbolFlagsMap({{Foo, FooSym.getFlags()}}), + [&](std::unique_ptr<MaterializationResponsibility> R) { + FooR = std::move(R); + }))); + + cantFail(JD.define(std::make_unique<SimpleMaterializationUnit>( + SymbolFlagsMap({{Bar, BarSym.getFlags()}}), + [&](std::unique_ptr<MaterializationResponsibility> R) { + BarR = std::move(R); + }))); + + cantFail(JD.define(std::make_unique<SimpleMaterializationUnit>( + SymbolFlagsMap({{Baz, BazSym.getFlags()}}), + [&](std::unique_ptr<MaterializationResponsibility> R) { + cantFail(R->notifyResolved({{Baz, BazSym}})); + cantFail(R->notifyEmitted({})); + }))); + + // First issue a lookup for Foo and Bar so that we can put them + // into the required states for the test lookup below.
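+  // States being set up: Foo becomes Resolved and Emitted (Ready, so it is
+  // not reportable as a dependency), Bar becomes Resolved but not Emitted
+  // (so the second lookup must still report it), and Baz only materializes
+  // on demand, resolving and emitting immediately when it does.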
+ ES.lookup( + LookupKind::Static, makeJITDylibSearchOrder(&JD), + SymbolLookupSet({Foo, Bar}), SymbolState::Resolved, + [](Expected<SymbolMap> Result) { + EXPECT_THAT_EXPECTED(std::move(Result), Succeeded()); + }, + NoDependenciesToRegister); + + cantFail(FooR->notifyResolved({{Foo, FooSym}})); + cantFail(FooR->notifyEmitted({})); + + cantFail(BarR->notifyResolved({{Bar, BarSym}})); + + ES.lookup( + LookupKind::Static, makeJITDylibSearchOrder(&JD), + SymbolLookupSet({Foo, Bar, Baz}), SymbolState::Resolved, + [](Expected<SymbolMap> Result) { + EXPECT_THAT_EXPECTED(std::move(Result), Succeeded()); + }, + [&](const SymbolDependenceMap &Dependencies) { + EXPECT_EQ(Dependencies.size(), 1U) + << "Expect dependencies on only one JITDylib"; + EXPECT_TRUE(Dependencies.count(&JD)) + << "Expect dependencies on JD only"; + auto &Deps = Dependencies.begin()->second; + EXPECT_EQ(Deps.size(), 2U); + EXPECT_TRUE(Deps.count(Bar)); + EXPECT_TRUE(Deps.count(Baz)); + DependenciesCallbackRan = true; + }); + + cantFail(BarR->notifyEmitted({})); + + EXPECT_TRUE(DependenciesCallbackRan); +} + TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) { // Test that a circular symbol dependency between three symbols in a JITDylib // does not prevent any symbol from becoming 'ready' once all symbols are diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp index 0af812564c026..b4dbc4ed435aa 100644 --- a/llvm/unittests/IR/InstructionsTest.cpp +++ b/llvm/unittests/IR/InstructionsTest.cpp @@ -1923,5 +1923,27 @@ TEST(InstructionsTest, AtomicSyncscope) { EXPECT_TRUE(LLVMIsAtomicSingleThread(CmpXchg)); } +TEST(InstructionsTest, CmpPredicate) { + CmpPredicate P0(CmpInst::ICMP_ULE, false), P1(CmpInst::ICMP_ULE, true), + P2(CmpInst::ICMP_SLE, false), P3(CmpInst::ICMP_SLT, false); + CmpPredicate Q0 = P0, Q1 = P1, Q2 = P2; + CmpInst::Predicate R0 = P0, R1 = P1, R2 = P2; + + EXPECT_EQ(*CmpPredicate::getMatching(P0, P1), CmpInst::ICMP_ULE); + EXPECT_EQ(CmpPredicate::getMatching(P0, P1)->hasSameSign(), false); + EXPECT_EQ(*CmpPredicate::getMatching(P1, P1), CmpInst::ICMP_ULE); + EXPECT_EQ(CmpPredicate::getMatching(P1, P1)->hasSameSign(), true); + EXPECT_EQ(CmpPredicate::getMatching(P0, P2), std::nullopt); + EXPECT_EQ(*CmpPredicate::getMatching(P1, P2), CmpInst::ICMP_SLE); + EXPECT_EQ(CmpPredicate::getMatching(P1, P2)->hasSameSign(), false); + EXPECT_EQ(CmpPredicate::getMatching(P1, P3), std::nullopt); + EXPECT_FALSE(Q0.hasSameSign()); + EXPECT_TRUE(Q1.hasSameSign()); + EXPECT_FALSE(Q2.hasSameSign()); + EXPECT_EQ(P0, R0); + EXPECT_EQ(P1, R1); + EXPECT_EQ(P2, R2); +} + } // end anonymous namespace } // end namespace llvm diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index 18e0b8fd135bb..0846f66ea6452 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -157,7 +157,8 @@ static bool doesSuffixLookLikeMangledType(StringRef Suffix) { return false; // [pi][0-9]+ - if (is_contained("pi", Suffix[0]) && all_of(Suffix.drop_front(), isDigit)) + if (Suffix.size() > 1 && is_contained("pi", Suffix[0]) && + all_of(Suffix.drop_front(), isDigit)) return true; // Match one of the named types.
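The `Suffix.size() > 1` guard above is the substance of the CodeGenIntrinsics fix: `all_of` over an empty range is vacuously true, so for a one-character suffix `Suffix.drop_front()` was empty and a bare "p" or "i" wrongly looked like a mangled type. A standalone re-sketch of the corrected predicate (simplified and hypothetical; the real helper also matches a list of named types):

#include <cctype>
#include <cstddef>
#include <string>

// True for suffixes of the form [pi][0-9]+, e.g. "p0" or "i32".
static bool looksLikeMangledPointerOrInt(const std::string &Suffix) {
  // The size check rejects a bare "p" or "i" up front.
  if (Suffix.size() <= 1 || (Suffix[0] != 'p' && Suffix[0] != 'i'))
    return false;
  for (std::size_t I = 1; I < Suffix.size(); ++I)
    if (!std::isdigit(static_cast<unsigned char>(Suffix[I])))
      return false;
  return true;
}
// looksLikeMangledPointerOrInt("p0") -> true
// looksLikeMangledPointerOrInt("p")  -> false (was true before the guard)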
diff --git a/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn b/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn index 945d31afca10f..c7c9459fdff16 100644 --- a/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn @@ -6,10 +6,12 @@ unittest("CoreTests") { deps = [ "//bolt/lib/Core", "//bolt/lib/Rewrite", + "//bolt/lib/Profile", "//llvm/lib/DebugInfo/DWARF", "//llvm/lib/MC", "//llvm/lib/Object", "//llvm/lib/Target:TargetsToBuild", + "//llvm/lib/Testing/Support", ] sources = [ "BinaryContext.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn index 59ecb66f2bcb0..ab72ac4ae9f4b 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn @@ -43,6 +43,7 @@ static_library("CodeGen") { "CodeGen.cpp", "CodeGenCommonISel.cpp", "CodeGenPrepare.cpp", + "CodeGenTargetMachineImpl.cpp", "CommandFlags.cpp", "ComplexDeinterleavingPass.cpp", "CriticalAntiDepBreaker.cpp", @@ -83,7 +84,6 @@ static_library("CodeGen") { "IntrinsicLowering.cpp", "JMCInstrumenter.cpp", "KCFI.cpp", - "CodeGenTargetMachineImpl.cpp", "LatencyPriorityQueue.cpp", "LazyMachineBlockFrequencyInfo.cpp", "LexicalScopes.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Shared/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Shared/BUILD.gn index d152bc8c0c2d3..e66271a4d5f1e 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Shared/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/Shared/BUILD.gn @@ -8,5 +8,6 @@ static_library("Shared") { "OrcError.cpp", "OrcRTBridge.cpp", "SimpleRemoteEPCUtils.cpp", + "SymbolStringPool.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index cc24d9f4449cd..745179213ae32 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -179,6 +179,8 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUPromoteAlloca.cpp", "AMDGPUPromoteKernelArguments.cpp", "AMDGPURegBankCombiner.cpp", + "AMDGPURegBankLegalize.cpp", + "AMDGPURegBankSelect.cpp", "AMDGPURegisterBankInfo.cpp", "AMDGPURemoveIncompatibleFunctions.cpp", "AMDGPUReserveWWMRegs.cpp", diff --git a/mlir/include/mlir-c/Dialect/LLVM.h b/mlir/include/mlir-c/Dialect/LLVM.h index 0e6434073437a..ed9b23c343150 100644 --- a/mlir/include/mlir-c/Dialect/LLVM.h +++ b/mlir/include/mlir-c/Dialect/LLVM.h @@ -175,17 +175,17 @@ MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMComdatAttrGet(MlirContext ctx, MlirLLVMComdat comdat); enum MlirLLVMLinkage { - MlirLLVMLinkagePrivate = 0, - MlirLLVMLinkageInternal = 1, - MlirLLVMLinkageAvailableExternally = 2, - MlirLLVMLinkageLinkonce = 3, + MlirLLVMLinkageExternal = 0, + MlirLLVMLinkageAvailableExternally = 1, + MlirLLVMLinkageLinkonce = 2, + MlirLLVMLinkageLinkonceODR = 3, MlirLLVMLinkageWeak = 4, - MlirLLVMLinkageCommon = 5, + MlirLLVMLinkageWeakODR = 5, MlirLLVMLinkageAppending = 6, - MlirLLVMLinkageExternWeak = 7, - MlirLLVMLinkageLinkonceODR = 8, - MlirLLVMLinkageWeakODR = 9, - MlirLLVMLinkageExternal = 10, + MlirLLVMLinkageInternal = 7, + MlirLLVMLinkagePrivate = 8, + MlirLLVMLinkageExternWeak = 9, + MlirLLVMLinkageCommon = 10, }; typedef enum MlirLLVMLinkage MlirLLVMLinkage; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td index 4a43c16903394..c08b75de03647 
100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMEnums.td @@ -615,40 +615,40 @@ def ICmpPredicate : LLVM_EnumAttr< //===----------------------------------------------------------------------===// // Linkage attribute is used on functions and globals. The order follows that of -// https://llvm.org/docs/LangRef.html#linkage-types. The names are equivalent to -// visible names in the IR rather than to enum values names in llvm::GlobalValue -// since the latter is easier to change. -def LinkagePrivate - : LLVM_EnumAttrCase<"Private", "private", "PrivateLinkage", 0>; -def LinkageInternal - : LLVM_EnumAttrCase<"Internal", "internal", "InternalLinkage", 1>; +// llvm::GlobalValue::LinkageTypes from llvm/IR/GlobalValue.h. The names are +// equivalent to visible names in the IR rather than to enum values names in +// llvm::GlobalValue since the latter is easier to change. +def LinkageExternal + : LLVM_EnumAttrCase<"External", "external", "ExternalLinkage", 0>; def LinkageAvailableExternally : LLVM_EnumAttrCase<"AvailableExternally", "available_externally", - "AvailableExternallyLinkage", 2>; + "AvailableExternallyLinkage", 1>; def LinkageLinkonce - : LLVM_EnumAttrCase<"Linkonce", "linkonce", "LinkOnceAnyLinkage", 3>; + : LLVM_EnumAttrCase<"Linkonce", "linkonce", "LinkOnceAnyLinkage", 2>; +def LinkageLinkonceODR + : LLVM_EnumAttrCase<"LinkonceODR", "linkonce_odr", "LinkOnceODRLinkage", 3>; def LinkageWeak : LLVM_EnumAttrCase<"Weak", "weak", "WeakAnyLinkage", 4>; -def LinkageCommon - : LLVM_EnumAttrCase<"Common", "common", "CommonLinkage", 5>; +def LinkageWeakODR + : LLVM_EnumAttrCase<"WeakODR", "weak_odr", "WeakODRLinkage", 5>; def LinkageAppending : LLVM_EnumAttrCase<"Appending", "appending", "AppendingLinkage", 6>; +def LinkageInternal + : LLVM_EnumAttrCase<"Internal", "internal", "InternalLinkage", 7>; +def LinkagePrivate + : LLVM_EnumAttrCase<"Private", "private", "PrivateLinkage", 8>; def LinkageExternWeak - : LLVM_EnumAttrCase<"ExternWeak", "extern_weak", "ExternalWeakLinkage", 7>; -def LinkageLinkonceODR - : LLVM_EnumAttrCase<"LinkonceODR", "linkonce_odr", "LinkOnceODRLinkage", 8>; -def LinkageWeakODR - : LLVM_EnumAttrCase<"WeakODR", "weak_odr", "WeakODRLinkage", 9>; -def LinkageExternal - : LLVM_EnumAttrCase<"External", "external", "ExternalLinkage", 10>; + : LLVM_EnumAttrCase<"ExternWeak", "extern_weak", "ExternalWeakLinkage", 9>; +def LinkageCommon + : LLVM_EnumAttrCase<"Common", "common", "CommonLinkage", 10>; def LinkageEnum : LLVM_EnumAttr< "Linkage", "::llvm::GlobalValue::LinkageTypes", "LLVM linkage types", - [LinkagePrivate, LinkageInternal, LinkageAvailableExternally, - LinkageLinkonce, LinkageWeak, LinkageCommon, LinkageAppending, - LinkageExternWeak, LinkageLinkonceODR, LinkageWeakODR, LinkageExternal]> { + [LinkageExternal, LinkageAvailableExternally, LinkageLinkonce, + LinkageLinkonceODR, LinkageWeak, LinkageWeakODR, LinkageAppending, + LinkageInternal, LinkagePrivate, LinkageExternWeak, LinkageCommon]> { let cppNamespace = "::mlir::LLVM::linkage"; } diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 6de525bbb127c..92007f90166f2 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -832,7 +832,7 @@ def Tosa_PowOp : Tosa_ElementwiseOp<"pow", [SameOperandsAndResultElementType]> { ); let results = (outs - Tosa_Tensor:$z + Tosa_Tensor:$output ); let hasCanonicalizer = 1; diff --git 
a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h index a7222794f320b..6c1ff4d0e5e6b 100644 --- a/mlir/include/mlir/IR/OpImplementation.h +++ b/mlir/include/mlir/IR/OpImplementation.h @@ -1604,7 +1604,8 @@ class OpAsmParser : public AsmParser { size_t typeSize = llvm::range_size(types); if (operandSize != typeSize) return emitError(loc) - << operandSize << " operands present, but expected " << typeSize; + << "number of operands and types do not match: got " << operandSize + << " operands and " << typeSize << " types"; for (auto [operand, type] : llvm::zip_equal(operands, types)) if (resolveOperand(operand, type, result)) diff --git a/mlir/include/mlir/IR/VectorTypes.h b/mlir/include/mlir/IR/VectorTypes.h new file mode 100644 index 0000000000000..c209f869a579d --- /dev/null +++ b/mlir/include/mlir/IR/VectorTypes.h @@ -0,0 +1,51 @@ +//===- VectorTypes.h - MLIR Vector Types ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Convenience wrappers for `VectorType` to allow idiomatic code like +// * isa<ScalableVectorType>(type) +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_IR_VECTORTYPES_H +#define MLIR_IR_VECTORTYPES_H + +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Types.h" + +namespace mlir { +namespace vector { + +/// A vector type containing at least one scalable dimension. +class ScalableVectorType : public VectorType { +public: + using VectorType::VectorType; + + static bool classof(Type type) { + auto vecTy = llvm::dyn_cast<VectorType>(type); + if (!vecTy) + return false; + return vecTy.isScalable(); + } +}; + +/// A vector type with no scalable dimensions. +class FixedVectorType : public VectorType { +public: + using VectorType::VectorType; + static bool classof(Type type) { + auto vecTy = llvm::dyn_cast<VectorType>(type); + if (!vecTy) + return false; + return !vecTy.isScalable(); + } +}; + +} // namespace vector +} // namespace mlir + +#endif // MLIR_IR_VECTORTYPES_H diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp index 74c64761565d6..fe7646140db7e 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp +++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp @@ -21,6 +21,8 @@ #include "mlir/IR/OpImplementation.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/VectorTypes.h" +#include "mlir/Support/LogicalResult.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -224,8 +226,8 @@ LogicalResult arith::ConstantOp::verify() { // Note, we could relax this for vectors with 1 scalable dim, e.g.: // * arith.constant dense<[[3, 3], [1, 1]]> : vector<2 x [2] x i32> // However, this would most likely require updating the lowerings to LLVM.
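A hedged usage sketch for the wrappers above (hypothetical helper name, assuming only the header as shown): because `classof` is written in terms of `VectorType::isScalable()`, the generic casting utilities can now express "scalable vector" directly, which is exactly what the ArithOps.cpp hunk below switches to.

#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/VectorTypes.h"

// Accept an initializer for a scalable vector only when it is a splat,
// mirroring the verifier logic in the hunk that follows.
static bool isSupportedScalableInit(mlir::Type type, mlir::Attribute value) {
  if (!mlir::isa<mlir::vector::ScalableVectorType>(type))
    return true;  // fixed-size vectors and non-vectors: no restriction here
  return mlir::isa<mlir::SplatElementsAttr>(value);
}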
- auto vecType = dyn_cast<VectorType>(type); - if (vecType && vecType.isScalable() && !isa<SplatElementsAttr>(getValue())) + if (isa<ScalableVectorType>(type) && + !isa<SplatElementsAttr>(getValue())) return emitOpError( "intializing scalable vectors with elements attribute is not supported" " unless it's a vector splat"); diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 0c0a7bc98d8b5..ad709813c6216 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -1756,11 +1756,6 @@ static Value foldExtractFromShapeCast(ExtractOp extractOp) { if (!shapeCastOp) return Value(); - // 0-D vectors not supported. - assert(!hasZeroDimVectors(extractOp) && "0-D vectors not supported"); - if (hasZeroDimVectors(shapeCastOp)) - return Value(); - // Get the nth dimension size starting from lowest dimension. auto getDimReverse = [](VectorType type, int64_t n) { return type.getShape().take_back(n + 1).front(); }; diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp index 0aa9dcb36681b..dbce4a540dcfb 100644 --- a/mlir/lib/Transforms/RemoveDeadValues.cpp +++ b/mlir/lib/Transforms/RemoveDeadValues.cpp @@ -191,10 +191,10 @@ static void cleanSimpleOp(Operation *op, RunLivenessAnalysis &la) { /// non-live across all callers), /// (5) Dropping the uses of these return values from its callers, AND /// (6) Erasing these return values -/// iff it is not public or declaration. +/// iff it is not public or external. static void cleanFuncOp(FunctionOpInterface funcOp, Operation *module, RunLivenessAnalysis &la) { - if (funcOp.isPublic() || funcOp.isDeclaration()) + if (funcOp.isPublic() || funcOp.isExternal()) return; // Get the list of unnecessary (non-live) arguments in `nonLiveArgs`. diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index 5677d7ff41202..25806d9d0edd7 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -91,14 +91,14 @@ func.func @alloca_non_integer_alignment() { // ----- func.func @gep_missing_input_result_type(%pos : i64, %base : !llvm.ptr) { - // expected-error@+1 {{2 operands present, but expected 0}} + // expected-error@+1 {{number of operands and types do not match: got 2 operands and 0 types}} llvm.getelementptr %base[%pos] : () -> (), i64 } // ----- func.func @gep_missing_input_type(%pos : i64, %base : !llvm.ptr) { - // expected-error@+1 {{2 operands present, but expected 0}} + // expected-error@+1 {{number of operands and types do not match: got 2 operands and 0 types}} llvm.getelementptr %base[%pos] : () -> (!llvm.ptr), i64 } diff --git a/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir b/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir index fbebb97a11983..6584596cdfdb2 100644 --- a/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir +++ b/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir @@ -77,7 +77,7 @@ transform.sequence failures(propagate) { transform.sequence failures(propagate) { ^bb0(%arg0: !transform.any_op): %0 = transform.param.constant 2 : i64 -> !transform.param<i64> - // expected-error@below {{custom op 'transform.structured.vectorize' 1 operands present, but expected 2}} + // expected-error@+1 {{custom op 'transform.structured.vectorize' number of operands and types do not match: got 1 operands and 2 types}} transform.structured.vectorize %arg0 vector_sizes [%0, 2] : !transform.any_op, !transform.param<i64>, !transform.param<i64> } diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir index
337eb9eeb8fa5..80576be880127 100644 --- a/mlir/test/Dialect/SCF/invalid.mlir +++ b/mlir/test/Dialect/SCF/invalid.mlir @@ -247,7 +247,7 @@ func.func @parallel_more_results_than_reduces( func.func @parallel_more_results_than_initial_values( %arg0 : index, %arg1: index, %arg2: index) { - // expected-error@+1 {{'scf.parallel' 0 operands present, but expected 1}} + // expected-error@+1 {{'scf.parallel' number of operands and types do not match: got 0 operands and 1 types}} %res = scf.parallel (%i0) = (%arg0) to (%arg1) step (%arg2) -> f32 { scf.reduce(%arg0 : index) { ^bb0(%lhs: index, %rhs: index): @@ -609,7 +609,7 @@ func.func @wrong_num_results(%in: tensor<100xf32>, %out: tensor<100xf32>) { %c1 = arith.constant 1 : index %num_threads = arith.constant 100 : index - // expected-error @+1 {{1 operands present, but expected 2}} + // expected-error@+1 {{number of operands and types do not match: got 1 operands and 2 types}} %result:2 = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>, tensor<100xf32>) { %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32> scf.forall.in_parallel { diff --git a/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir b/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir index 5aef6135afd97..57ff94762ff68 100644 --- a/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir @@ -57,7 +57,7 @@ func.func @access_chain_non_composite() -> () { func.func @access_chain_no_indices(%index0 : i32) -> () { %0 = spirv.Variable : !spirv.ptr>, Function> - // expected-error @+1 {{custom op 'spirv.AccessChain' 0 operands present, but expected 1}} + // expected-error @+1 {{custom op 'spirv.AccessChain' number of operands and types do not match: got 0 operands and 1 types}} %1 = spirv.AccessChain %0[] : !spirv.ptr>, Function>, i32 -> !spirv.ptr return } @@ -75,7 +75,7 @@ func.func @access_chain_missing_comma(%index0 : i32) -> () { func.func @access_chain_invalid_indices_types_count(%index0 : i32) -> () { %0 = spirv.Variable : !spirv.ptr>, Function> - // expected-error @+1 {{custom op 'spirv.AccessChain' 1 operands present, but expected 2}} + // expected-error @+1 {{custom op 'spirv.AccessChain' number of operands and types do not match: got 1 operands and 2 types}} %1 = spirv.AccessChain %0[%index0] : !spirv.ptr>, Function>, i32, i32 -> !spirv.ptr, Function> return } @@ -84,7 +84,7 @@ func.func @access_chain_invalid_indices_types_count(%index0 : i32) -> () { func.func @access_chain_missing_indices_type(%index0 : i32) -> () { %0 = spirv.Variable : !spirv.ptr>, Function> - // expected-error @+1 {{custom op 'spirv.AccessChain' 2 operands present, but expected 1}} + // expected-error @+1 {{custom op 'spirv.AccessChain' number of operands and types do not match: got 2 operands and 1 types}} %1 = spirv.AccessChain %0[%index0, %index0] : !spirv.ptr>, Function>, i32 -> !spirv.ptr return } diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir index 77cae1cc5f242..83cb4b9d4ab24 100644 --- a/mlir/test/Dialect/Tensor/invalid.mlir +++ b/mlir/test/Dialect/Tensor/invalid.mlir @@ -90,7 +90,7 @@ func.func @tensor.from_elements_wrong_result_type() { // ----- func.func @tensor.from_elements_wrong_elements_count() { - // expected-error@+2 {{1 operands present, but expected 2}} + // expected-error@+2 {{number of operands and types do not match: got 1 operands and 2 types}} %c0 = arith.constant 0 : index %0 = tensor.from_elements %c0 : tensor<2xindex> return diff --git 
a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index 5ae769090dac6..89af0f7332f5c 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -782,23 +782,23 @@ func.func @fold_extract_shapecast(%arg0 : vector<5x1x3x2xf32>, // ----- -// CHECK-LABEL: fold_extract_shapecast_negative -// CHECK: %[[V:.*]] = vector.shape_cast %{{.*}} : vector<16xf32> to vector<2x4x2xf32> -// CHECK: %[[R:.*]] = vector.extract %[[V]][1] : vector<4x2xf32> from vector<2x4x2xf32> -// CHECK: return %[[R]] : vector<4x2xf32> -func.func @fold_extract_shapecast_negative(%arg0 : vector<16xf32>) -> vector<4x2xf32> { - %0 = vector.shape_cast %arg0 : vector<16xf32> to vector<2x4x2xf32> - %r = vector.extract %0[1] : vector<4x2xf32> from vector<2x4x2xf32> - return %r : vector<4x2xf32> +// CHECK-LABEL: fold_extract_shapecast_0d_result +// CHECK-SAME: %[[IN:.*]]: vector<1x1x1xf32> +// CHECK: %[[R:.*]] = vector.extract %[[IN]][0, 0, 0] : f32 from vector<1x1x1xf32> +// CHECK: return %[[R]] : f32 +func.func @fold_extract_shapecast_0d_result(%arg0 : vector<1x1x1xf32>) -> f32 { + %0 = vector.shape_cast %arg0 : vector<1x1x1xf32> to vector<f32> + %r = vector.extract %0[] : f32 from vector<f32> + return %r : f32 } // ----- -// CHECK-LABEL: dont_fold_0d_extract_shapecast -// CHECK: %[[V:.*]] = vector.shape_cast %{{.*}} : vector<f32> to vector<1xf32> -// CHECK: %[[R:.*]] = vector.extract %[[V]][0] : f32 from vector<1xf32> +// CHECK-LABEL: fold_extract_shapecast_0d_source +// CHECK-SAME: %[[IN:.*]]: vector<f32> +// CHECK: %[[R:.*]] = vector.extract %[[IN]][] : f32 from vector<f32> // CHECK: return %[[R]] : f32 -func.func @dont_fold_0d_extract_shapecast(%arg0 : vector<f32>) -> f32 { +func.func @fold_extract_shapecast_0d_source(%arg0 : vector<f32>) -> f32 { %0 = vector.shape_cast %arg0 : vector<f32> to vector<1xf32> %r = vector.extract %0[0] : f32 from vector<1xf32> return %r : f32 @@ -806,6 +806,18 @@ func.func @fold_extract_shapecast_0d_source(%arg0 : vector<f32>) -> f32 { // ----- +// CHECK-LABEL: fold_extract_shapecast_negative +// CHECK: %[[V:.*]] = vector.shape_cast %{{.*}} : vector<16xf32> to vector<2x4x2xf32> +// CHECK: %[[R:.*]] = vector.extract %[[V]][1] : vector<4x2xf32> from vector<2x4x2xf32> +// CHECK: return %[[R]] : vector<4x2xf32> +func.func @fold_extract_shapecast_negative(%arg0 : vector<16xf32>) -> vector<4x2xf32> { + %0 = vector.shape_cast %arg0 : vector<16xf32> to vector<2x4x2xf32> + %r = vector.extract %0[1] : vector<4x2xf32> from vector<2x4x2xf32> + return %r : vector<4x2xf32> +} + +// ----- + // CHECK-LABEL: fold_extract_shapecast_to_shapecast // CHECK-SAME: (%[[ARG:.+]]: vector<3x4xf32>) // CHECK: %[[R:.+]] = vector.shape_cast %[[ARG]] : vector<3x4xf32> to vector<12xf32> diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index 9f7efa15ed520..1a70791fae125 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1803,7 +1803,7 @@ func.func @deinterleave_scalable_rank_fail(%vec : vector<2x[4]xf32>) { // ----- func.func @invalid_from_elements(%a: f32) { - // expected-error @+1 {{'vector.from_elements' 1 operands present, but expected 2}} + // expected-error @+1 {{'vector.from_elements' number of operands and types do not match: got 1 operands and 2 types}} vector.from_elements %a : vector<2xf32> return } diff --git a/mlir/test/Transforms/remove-dead-values.mlir b/mlir/test/Transforms/remove-dead-values.mlir index 826f6159a36b6..538755291e81a 100644 ---
a/mlir/test/Transforms/remove-dead-values.mlir +++ b/mlir/test/Transforms/remove-dead-values.mlir @@ -377,3 +377,8 @@ func.func @kernel(%arg0: memref<18xf32>) { // CHECK: func.func private @no_block_func_declaration() func.func private @no_block_func_declaration() -> () + +// ----- + +// CHECK: llvm.func @no_block_external_func() +llvm.func @no_block_external_func() attributes {sym_visibility = "private"} diff --git a/mlir/unittests/Target/LLVM/CMakeLists.txt b/mlir/unittests/Target/LLVM/CMakeLists.txt index 6d612548a94c0..0c61d222dedf4 100644 --- a/mlir/unittests/Target/LLVM/CMakeLists.txt +++ b/mlir/unittests/Target/LLVM/CMakeLists.txt @@ -1,11 +1,11 @@ +set(LLVM_LINK_COMPONENTS nativecodegen) + add_mlir_unittest(MLIRTargetLLVMTests SerializeNVVMTarget.cpp SerializeROCDLTarget.cpp SerializeToLLVMBitcode.cpp ) -llvm_map_components_to_libnames(llvm_libs nativecodegen) - target_link_libraries(MLIRTargetLLVMTests PRIVATE MLIRTargetLLVM @@ -19,7 +19,6 @@ target_link_libraries(MLIRTargetLLVMTests MLIRNVVMToLLVMIRTranslation MLIRROCDLToLLVMIRTranslation MLIRGPUToLLVMIRTranslation - ${llvm_libs} ) if (DEFINED LLVM_NATIVE_TARGET) diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index 3da83e5c30713..b52305b9516fb 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -131,7 +131,7 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden -DOMPTARGET_DEVICE_RUNTIME -I${include_directory} -I${devicertl_base_directory}/../include - -I${LLVM_MAIN_SRC_DIR}/../libc + -I${devicertl_base_directory}/../../libc ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL} ) @@ -276,7 +276,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple) target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512) target_include_directories(${ide_target_name} PRIVATE ${include_directory} - ${LLVM_MAIN_SRC_DIR}/../libc + ${devicertl_base_directory}/../../libc ${devicertl_base_directory}/../include ${LIBOMPTARGET_LLVM_INCLUDE_DIRS} ) diff --git a/openmp/Maintainers.md b/openmp/Maintainers.md new file mode 100644 index 0000000000000..1892577374e6f --- /dev/null +++ b/openmp/Maintainers.md @@ -0,0 +1,13 @@ +# LLVM OpenMP Library Maintainers + +This file is a list of the +[maintainers](https://llvm.org/docs/DeveloperPolicy.html#maintainers) for +the LLVM OpenMP library. 
+ +# Current Maintainers + +Michael Klemm \ +michael.klemm@amd.com (email), [mjklemm](https://github.com/mjklemm) (GitHub) + +Terry Wilmarth \ +terry.l.wilmarth@intel.com (email), [TerryLWilmarth](https://github.com/TerryLWilmarth) (GitHub) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 2573788658d59..c5a0076d2ef30 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -740,6 +740,7 @@ libc_support_library( hdrs = ["src/__support/integer_literals.h"], deps = [ ":__support_cpp_limits", + ":__support_ctype_utils", ":__support_uint128", ], ) @@ -772,6 +773,7 @@ libc_support_library( ":__support_cpp_span", ":__support_cpp_string_view", ":__support_cpp_type_traits", + ":__support_ctype_utils", ], ) @@ -4450,6 +4452,7 @@ libc_support_library( ":__support_cpp_limits", ":__support_cpp_span", ":__support_cpp_string_view", + ":__support_ctype_utils", ":__support_float_to_string", ":__support_fputil_fenv_impl", ":__support_fputil_fp_bits", diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl index 7d135b465bce1..82e65a728bc61 100644 --- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl @@ -74,7 +74,6 @@ def libc_function( for. srcs: The .cpp files which contain the function implementation. weak: Make the symbol corresponding to the libc function "weak". - deps: The list of target dependencies if any. copts: The list of options to add to the C++ compilation command. local_defines: The preprocessor defines which will be prepended with -D and passed to the compile command of this target but not @@ -138,9 +137,6 @@ def libc_math_function( Args: name: The name of the function. - specializations: List of machine specializations available for this - function. Possible specializations are "generic", - "aarch64" and "x86_64". additional_deps: Other deps like helper cc_library targets used by the math function.
""" diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel index f387741e95d8b..6db3456edbb70 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel @@ -63,12 +63,12 @@ libc_support_library( "//libc:__support_stringutil", "//libc:__support_uint128", "//libc:errno", - "//libc:llvm_libc_macros_stdfix_macros", - "//llvm:Support", "//libc:func_aligned_alloc", "//libc:func_free", "//libc:func_malloc", "//libc:func_realloc", + "//libc:llvm_libc_macros_stdfix_macros", + "//llvm:Support", ], ) @@ -121,6 +121,7 @@ libc_support_library( "//libc:__support_cpp_bitset", "//libc:__support_cpp_span", "//libc:__support_cpp_type_traits", + "//libc:__support_ctype_utils", "//libc:__support_macros_config", ], ) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel index 53b400ac0d593..e4b4b075705e8 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel @@ -161,6 +161,7 @@ libc_support_library( deps = [ "//libc:__support_cpp_limits", "//libc:__support_cpp_type_traits", + "//libc:__support_ctype_utils", "//libc:__support_macros_properties_architectures", "//libc:errno.__internal__", "//libc/test/UnitTest:LibcUnitTest", diff --git a/utils/bazel/llvm_configs/abi-breaking.h.cmake b/utils/bazel/llvm_configs/abi-breaking.h.cmake index 81495f0569752..2d27e02b1d545 100644 --- a/utils/bazel/llvm_configs/abi-breaking.h.cmake +++ b/utils/bazel/llvm_configs/abi-breaking.h.cmake @@ -12,8 +12,6 @@ #ifndef LLVM_ABI_BREAKING_CHECKS_H #define LLVM_ABI_BREAKING_CHECKS_H -#include "llvm/Support/Compiler.h" - /* Define to enable checks that alter the LLVM C++ ABI */ #cmakedefine01 LLVM_ENABLE_ABI_BREAKING_CHECKS @@ -45,12 +43,12 @@ #endif namespace llvm { #if LLVM_ENABLE_ABI_BREAKING_CHECKS -LLVM_ABI extern int EnableABIBreakingChecks; +extern int EnableABIBreakingChecks; LLVM_HIDDEN_VISIBILITY __attribute__((weak)) int *VerifyEnableABIBreakingChecks = &EnableABIBreakingChecks; #else -LLVM_ABI extern int DisableABIBreakingChecks; +extern int DisableABIBreakingChecks; LLVM_HIDDEN_VISIBILITY __attribute__((weak)) int *VerifyDisableABIBreakingChecks = &DisableABIBreakingChecks;